546063e728db1cd9e3bb20c9d9276d7d399a7d49
[cascardo/linux.git] / drivers / staging / lustre / lustre / llite / llite_lib.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2015, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  *
32  * lustre/llite/llite_lib.c
33  *
34  * Lustre Light Super operations
35  */
36
37 #define DEBUG_SUBSYSTEM S_LLITE
38
39 #include <linux/module.h>
40 #include <linux/statfs.h>
41 #include <linux/types.h>
42 #include <linux/mm.h>
43
44 #include "../include/lustre_lite.h"
45 #include "../include/lustre_ha.h"
46 #include "../include/lustre_dlm.h"
47 #include "../include/lprocfs_status.h"
48 #include "../include/lustre_disk.h"
49 #include "../include/lustre_param.h"
50 #include "../include/lustre_log.h"
51 #include "../include/cl_object.h"
52 #include "../include/obd_cksum.h"
53 #include "llite_internal.h"
54
/* Slab cache handle; presumably backs struct ll_file_data allocations --
 * TODO confirm at the allocation site, which is not in this part of the file.
 */
struct kmem_cache *ll_file_data_slab;
/* debugfs directory for llite; when non-NULL, client_common_fill_super()
 * registers every mount under it with ldebugfs_register_mountpoint().
 */
struct dentry *llite_root;
/* kset for llite objects; its users are outside this part of the file. */
struct kset *llite_kset;

/* Fallback log2(): ffz(~(n)) is the index of the lowest set bit of n, so the
 * result is only a true base-2 logarithm when n is a power of two.  It is
 * used below to derive s_blocksize_bits from the block size.
 */
#ifndef log2
#define log2(n) ffz(~(n))
#endif
62
63 static struct ll_sb_info *ll_init_sbi(struct super_block *sb)
64 {
65         struct ll_sb_info *sbi = NULL;
66         unsigned long pages;
67         unsigned long lru_page_max;
68         struct sysinfo si;
69         class_uuid_t uuid;
70         int i;
71
72         sbi = kzalloc(sizeof(*sbi), GFP_NOFS);
73         if (!sbi)
74                 return NULL;
75
76         spin_lock_init(&sbi->ll_lock);
77         mutex_init(&sbi->ll_lco.lco_lock);
78         spin_lock_init(&sbi->ll_pp_extent_lock);
79         spin_lock_init(&sbi->ll_process_lock);
80         sbi->ll_rw_stats_on = 0;
81
82         si_meminfo(&si);
83         pages = si.totalram - si.totalhigh;
84         lru_page_max = pages / 2;
85
86         sbi->ll_cache = cl_cache_init(lru_page_max);
87         if (!sbi->ll_cache) {
88                 kfree(sbi);
89                 return NULL;
90         }
91
92         sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
93                                            SBI_DEFAULT_READAHEAD_MAX);
94         sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
95         sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
96                                            SBI_DEFAULT_READAHEAD_WHOLE_MAX;
97
98         ll_generate_random_uuid(uuid);
99         class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
100         CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
101
102         sbi->ll_flags |= LL_SBI_VERBOSE;
103         sbi->ll_flags |= LL_SBI_CHECKSUM;
104
105         sbi->ll_flags |= LL_SBI_LRU_RESIZE;
106
107         for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
108                 spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
109                                pp_r_hist.oh_lock);
110                 spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
111                                pp_w_hist.oh_lock);
112         }
113
114         /* metadata statahead is enabled by default */
115         sbi->ll_sa_max = LL_SA_RPC_DEF;
116         atomic_set(&sbi->ll_sa_total, 0);
117         atomic_set(&sbi->ll_sa_wrong, 0);
118         atomic_set(&sbi->ll_agl_total, 0);
119         sbi->ll_flags |= LL_SBI_AGL_ENABLED;
120
121         sbi->ll_sb = sb;
122
123         return sbi;
124 }
125
126 static void ll_free_sbi(struct super_block *sb)
127 {
128         struct ll_sb_info *sbi = ll_s2sbi(sb);
129
130         if (sbi->ll_cache) {
131                 cl_cache_decref(sbi->ll_cache);
132                 sbi->ll_cache = NULL;
133         }
134
135         kfree(sbi);
136 }
137
138 static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
139                                     struct vfsmount *mnt)
140 {
141         struct inode *root = NULL;
142         struct ll_sb_info *sbi = ll_s2sbi(sb);
143         struct obd_device *obd;
144         struct obd_statfs *osfs = NULL;
145         struct ptlrpc_request *request = NULL;
146         struct obd_connect_data *data = NULL;
147         struct obd_uuid *uuid;
148         struct md_op_data *op_data;
149         struct lustre_md lmd;
150         u64 valid;
151         int size, err, checksum;
152
153         obd = class_name2obd(md);
154         if (!obd) {
155                 CERROR("MD %s: not setup or attached\n", md);
156                 return -EINVAL;
157         }
158
159         data = kzalloc(sizeof(*data), GFP_NOFS);
160         if (!data)
161                 return -ENOMEM;
162
163         osfs = kzalloc(sizeof(*osfs), GFP_NOFS);
164         if (!osfs) {
165                 kfree(data);
166                 return -ENOMEM;
167         }
168
169         /* indicate the features supported by this client */
170         data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
171                                   OBD_CONNECT_ATTRFID  |
172                                   OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
173                                   OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
174                                   OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
175                                   OBD_CONNECT_VBR       | OBD_CONNECT_FULL20  |
176                                   OBD_CONNECT_64BITHASH |
177                                   OBD_CONNECT_EINPROGRESS |
178                                   OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
179                                   OBD_CONNECT_LAYOUTLOCK |
180                                   OBD_CONNECT_PINGLESS |
181                                   OBD_CONNECT_MAX_EASIZE |
182                                   OBD_CONNECT_FLOCK_DEAD |
183                                   OBD_CONNECT_DISP_STRIPE;
184
185         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
186                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
187
188         if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
189                 data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
190 #ifdef CONFIG_FS_POSIX_ACL
191         data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK;
192 #endif
193
194         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
195                 /* flag mdc connection as lightweight, only used for test
196                  * purpose, use with care
197                  */
198                 data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT;
199
200         data->ocd_ibits_known = MDS_INODELOCK_FULL;
201         data->ocd_version = LUSTRE_VERSION_CODE;
202
203         if (sb->s_flags & MS_RDONLY)
204                 data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
205         if (sbi->ll_flags & LL_SBI_USER_XATTR)
206                 data->ocd_connect_flags |= OBD_CONNECT_XATTR;
207
208         if (sbi->ll_flags & LL_SBI_FLOCK)
209                 sbi->ll_fop = &ll_file_operations_flock;
210         else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
211                 sbi->ll_fop = &ll_file_operations;
212         else
213                 sbi->ll_fop = &ll_file_operations_noflock;
214
215         /* real client */
216         data->ocd_connect_flags |= OBD_CONNECT_REAL;
217
218         data->ocd_brw_size = MD_MAX_BRW_SIZE;
219
220         err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid,
221                           data, NULL);
222         if (err == -EBUSY) {
223                 LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n",
224                                    md);
225                 goto out;
226         } else if (err) {
227                 CERROR("cannot connect to %s: rc = %d\n", md, err);
228                 goto out;
229         }
230
231         sbi->ll_md_exp->exp_connect_data = *data;
232
233         err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp,
234                            LUSTRE_SEQ_METADATA);
235         if (err) {
236                 CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n",
237                        sbi->ll_md_exp->exp_obd->obd_name, err);
238                 goto out_md;
239         }
240
241         /* For mount, we only need fs info from MDT0, and also in DNE, it
242          * can make sure the client can be mounted as long as MDT0 is
243          * available
244          */
245         err = obd_statfs(NULL, sbi->ll_md_exp, osfs,
246                          cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
247                          OBD_STATFS_FOR_MDT0);
248         if (err)
249                 goto out_md_fid;
250
251         /* This needs to be after statfs to ensure connect has finished.
252          * Note that "data" does NOT contain the valid connect reply.
253          * If connecting to a 1.8 server there will be no LMV device, so
254          * we can access the MDC export directly and exp_connect_flags will
255          * be non-zero, but if accessing an upgraded 2.1 server it will
256          * have the correct flags filled in.
257          * XXX: fill in the LMV exp_connect_flags from MDC(s).
258          */
259         valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD;
260         if (exp_connect_flags(sbi->ll_md_exp) != 0 &&
261             valid != CLIENT_CONNECT_MDT_REQD) {
262                 char *buf;
263
264                 buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
265                 if (!buf) {
266                         err = -ENOMEM;
267                         goto out_md_fid;
268                 }
269                 obd_connect_flags2str(buf, PAGE_SIZE,
270                                       valid ^ CLIENT_CONNECT_MDT_REQD, ",");
271                 LCONSOLE_ERROR_MSG(0x170, "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n",
272                                    sbi->ll_md_exp->exp_obd->obd_name, buf);
273                 kfree(buf);
274                 err = -EPROTO;
275                 goto out_md_fid;
276         }
277
278         size = sizeof(*data);
279         err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA),
280                            KEY_CONN_DATA,  &size, data, NULL);
281         if (err) {
282                 CERROR("%s: Get connect data failed: rc = %d\n",
283                        sbi->ll_md_exp->exp_obd->obd_name, err);
284                 goto out_md_fid;
285         }
286
287         LASSERT(osfs->os_bsize);
288         sb->s_blocksize = osfs->os_bsize;
289         sb->s_blocksize_bits = log2(osfs->os_bsize);
290         sb->s_magic = LL_SUPER_MAGIC;
291         sb->s_maxbytes = MAX_LFS_FILESIZE;
292         sbi->ll_namelen = osfs->os_namelen;
293
294         if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
295             !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
296                 LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n");
297                 sbi->ll_flags &= ~LL_SBI_USER_XATTR;
298         }
299
300         if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
301                 sb->s_flags |= MS_POSIXACL;
302                 sbi->ll_flags |= LL_SBI_ACL;
303         } else {
304                 LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
305                 sb->s_flags &= ~MS_POSIXACL;
306                 sbi->ll_flags &= ~LL_SBI_ACL;
307         }
308
309         if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH)
310                 sbi->ll_flags |= LL_SBI_64BIT_HASH;
311
312         if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
313                 sbi->ll_md_brw_size = data->ocd_brw_size;
314         else
315                 sbi->ll_md_brw_size = PAGE_SIZE;
316
317         if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK)
318                 sbi->ll_flags |= LL_SBI_LAYOUT_LOCK;
319
320         if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) {
321                 if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) {
322                         LCONSOLE_INFO(
323                                 "%s: disabling xattr cache due to unknown maximum xattr size.\n",
324                                 dt);
325                 } else {
326                         sbi->ll_flags |= LL_SBI_XATTR_CACHE;
327                         sbi->ll_xattr_cache_enabled = 1;
328                 }
329         }
330
331         obd = class_name2obd(dt);
332         if (!obd) {
333                 CERROR("DT %s: not setup or attached\n", dt);
334                 err = -ENODEV;
335                 goto out_md_fid;
336         }
337
338         data->ocd_connect_flags = OBD_CONNECT_GRANT     | OBD_CONNECT_VERSION  |
339                                   OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
340                                   OBD_CONNECT_CANCELSET | OBD_CONNECT_FID      |
341                                   OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK|
342                                   OBD_CONNECT_AT        | OBD_CONNECT_OSS_CAPA |
343                                   OBD_CONNECT_VBR       | OBD_CONNECT_FULL20   |
344                                   OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES |
345                                   OBD_CONNECT_EINPROGRESS |
346                                   OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
347                                   OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
348
349         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
350                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
351
352         if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
353                 /* OBD_CONNECT_CKSUM should always be set, even if checksums are
354                  * disabled by default, because it can still be enabled on the
355                  * fly via /sys. As a consequence, we still need to come to an
356                  * agreement on the supported algorithms at connect time
357                  */
358                 data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
359
360                 if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
361                         data->ocd_cksum_types = OBD_CKSUM_ADLER;
362                 else
363                         data->ocd_cksum_types = cksum_types_supported_client();
364         }
365
366         data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
367
368         CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n",
369                data->ocd_connect_flags,
370                data->ocd_version, data->ocd_grant);
371
372         obd->obd_upcall.onu_owner = &sbi->ll_lco;
373         obd->obd_upcall.onu_upcall = cl_ocd_update;
374
375         data->ocd_brw_size = DT_MAX_BRW_SIZE;
376
377         err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data,
378                           NULL);
379         if (err == -EBUSY) {
380                 LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing recovery, of which this client is not a part.  Please wait for recovery to complete, abort, or time out.\n",
381                                    dt);
382                 goto out_md;
383         } else if (err) {
384                 CERROR("%s: Cannot connect to %s: rc = %d\n",
385                        sbi->ll_dt_exp->exp_obd->obd_name, dt, err);
386                 goto out_md;
387         }
388
389         sbi->ll_dt_exp->exp_connect_data = *data;
390
391         err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
392                            LUSTRE_SEQ_METADATA);
393         if (err) {
394                 CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n",
395                        sbi->ll_dt_exp->exp_obd->obd_name, err);
396                 goto out_dt;
397         }
398
399         mutex_lock(&sbi->ll_lco.lco_lock);
400         sbi->ll_lco.lco_flags = data->ocd_connect_flags;
401         sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
402         sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
403         mutex_unlock(&sbi->ll_lco.lco_lock);
404
405         fid_zero(&sbi->ll_root_fid);
406         err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid);
407         if (err) {
408                 CERROR("cannot mds_connect: rc = %d\n", err);
409                 goto out_lock_cn_cb;
410         }
411         if (!fid_is_sane(&sbi->ll_root_fid)) {
412                 CERROR("%s: Invalid root fid "DFID" during mount\n",
413                        sbi->ll_md_exp->exp_obd->obd_name,
414                        PFID(&sbi->ll_root_fid));
415                 err = -EINVAL;
416                 goto out_lock_cn_cb;
417         }
418         CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid));
419
420         sb->s_op = &lustre_super_operations;
421 #if THREAD_SIZE >= 8192 /*b=17630*/
422         sb->s_export_op = &lustre_export_operations;
423 #endif
424
425         /* make root inode
426          * XXX: move this to after cbd setup?
427          */
428         valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE;
429         if (sbi->ll_flags & LL_SBI_ACL)
430                 valid |= OBD_MD_FLACL;
431
432         op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
433         if (!op_data) {
434                 err = -ENOMEM;
435                 goto out_lock_cn_cb;
436         }
437
438         op_data->op_fid1 = sbi->ll_root_fid;
439         op_data->op_mode = 0;
440         op_data->op_valid = valid;
441
442         err = md_getattr(sbi->ll_md_exp, op_data, &request);
443         kfree(op_data);
444         if (err) {
445                 CERROR("%s: md_getattr failed for root: rc = %d\n",
446                        sbi->ll_md_exp->exp_obd->obd_name, err);
447                 goto out_lock_cn_cb;
448         }
449
450         err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
451                                sbi->ll_md_exp, &lmd);
452         if (err) {
453                 CERROR("failed to understand root inode md: rc = %d\n", err);
454                 ptlrpc_req_finished(request);
455                 goto out_lock_cn_cb;
456         }
457
458         LASSERT(fid_is_sane(&sbi->ll_root_fid));
459         root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid,
460                                             sbi->ll_flags & LL_SBI_32BIT_API),
461                        &lmd);
462         md_free_lustre_md(sbi->ll_md_exp, &lmd);
463         ptlrpc_req_finished(request);
464
465         if (!(root)) {
466                 if (lmd.lsm)
467                         obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
468 #ifdef CONFIG_FS_POSIX_ACL
469                 if (lmd.posix_acl) {
470                         posix_acl_release(lmd.posix_acl);
471                         lmd.posix_acl = NULL;
472                 }
473 #endif
474                 err = -EBADF;
475                 CERROR("lustre_lite: bad iget4 for root\n");
476                 goto out_root;
477         }
478
479         err = ll_close_thread_start(&sbi->ll_lcq);
480         if (err) {
481                 CERROR("cannot start close thread: rc %d\n", err);
482                 goto out_root;
483         }
484
485         checksum = sbi->ll_flags & LL_SBI_CHECKSUM;
486         err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
487                                  KEY_CHECKSUM, sizeof(checksum), &checksum,
488                                  NULL);
489         cl_sb_init(sb);
490
491         err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET),
492                                  KEY_CACHE_SET, sizeof(*sbi->ll_cache),
493                                  sbi->ll_cache, NULL);
494
495         sb->s_root = d_make_root(root);
496         if (!sb->s_root) {
497                 CERROR("%s: can't make root dentry\n",
498                        ll_get_fsname(sb, NULL, 0));
499                 err = -ENOMEM;
500                 goto out_lock_cn_cb;
501         }
502
503         sbi->ll_sdev_orig = sb->s_dev;
504
505         /* We set sb->s_dev equal on all lustre clients in order to support
506          * NFS export clustering.  NFSD requires that the FSID be the same
507          * on all clients.
508          */
509         /* s_dev is also used in lt_compare() to compare two fs, but that is
510          * only a node-local comparison.
511          */
512         uuid = obd_get_uuid(sbi->ll_md_exp);
513         if (uuid) {
514                 sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
515                 get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), &sbi->ll_fsid);
516         }
517
518         kfree(data);
519         kfree(osfs);
520
521         if (llite_root) {
522                 err = ldebugfs_register_mountpoint(llite_root, sb, dt, md);
523                 if (err < 0) {
524                         CERROR("%s: could not register mount in debugfs: "
525                                "rc = %d\n", ll_get_fsname(sb, NULL, 0), err);
526                         err = 0;
527                 }
528         }
529
530         return err;
531 out_root:
532         iput(root);
533 out_lock_cn_cb:
534         obd_fid_fini(sbi->ll_dt_exp->exp_obd);
535 out_dt:
536         obd_disconnect(sbi->ll_dt_exp);
537         sbi->ll_dt_exp = NULL;
538 out_md_fid:
539         obd_fid_fini(sbi->ll_md_exp->exp_obd);
540 out_md:
541         obd_disconnect(sbi->ll_md_exp);
542         sbi->ll_md_exp = NULL;
543 out:
544         kfree(data);
545         kfree(osfs);
546         return err;
547 }
548
549 int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
550 {
551         int size, rc;
552
553         *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL);
554         size = sizeof(int);
555         rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE),
556                           KEY_MAX_EASIZE, &size, lmmsize, NULL);
557         if (rc)
558                 CERROR("Get max mdsize error rc %d\n", rc);
559
560         return rc;
561 }
562
563 int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize)
564 {
565         int size, rc;
566
567         size = sizeof(int);
568         rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE),
569                           KEY_DEFAULT_EASIZE, &size, lmmsize, NULL);
570         if (rc)
571                 CERROR("Get default mdsize error rc %d\n", rc);
572
573         return rc;
574 }
575
/*
 * Tear down what client_common_fill_super() set up for @sb: stop the close
 * thread, run cl_sb_fini(), then finalise the FID layer of and disconnect
 * from the DT export, remove the debugfs mount entry, and do the same
 * FID-fini/disconnect for the MD export.
 *
 * Both export pointers are dereferenced unconditionally, so this must only
 * be called while sbi->ll_dt_exp and sbi->ll_md_exp are still set.
 */
static void client_common_put_super(struct super_block *sb)
{
	struct ll_sb_info *sbi = ll_s2sbi(sb);

	ll_close_thread_shutdown(sbi->ll_lcq);

	cl_sb_fini(sb);

	/* data side first ... */
	obd_fid_fini(sbi->ll_dt_exp->exp_obd);
	obd_disconnect(sbi->ll_dt_exp);
	sbi->ll_dt_exp = NULL;

	ldebugfs_unregister_mountpoint(sbi);

	/* ... then the metadata side */
	obd_fid_fini(sbi->ll_md_exp->exp_obd);
	obd_disconnect(sbi->ll_md_exp);
	sbi->ll_md_exp = NULL;
}
594
595 void ll_kill_super(struct super_block *sb)
596 {
597         struct ll_sb_info *sbi;
598
599         /* not init sb ?*/
600         if (!(sb->s_flags & MS_ACTIVE))
601                 return;
602
603         sbi = ll_s2sbi(sb);
604         /* we need to restore s_dev from changed for clustered NFS before
605          * put_super because new kernels have cached s_dev and change sb->s_dev
606          * in put_super not affected real removing devices
607          */
608         if (sbi) {
609                 sb->s_dev = sbi->ll_sdev_orig;
610                 sbi->ll_umounting = 1;
611         }
612 }
613
/*
 * Return @fl when @data begins with the option name @opt, 0 otherwise.
 * Only the first strlen(@opt) characters are compared, so this is a prefix
 * match: whatever follows the name in @data is not examined.
 */
static inline int ll_set_opt(const char *opt, char *data, int fl)
{
	size_t name_len = strlen(opt);

	return strncmp(opt, data, name_len) == 0 ? fl : 0;
}
621
622 /* non-client-specific mount options are parsed in lmd_parse */
623 static int ll_options(char *options, int *flags)
624 {
625         int tmp;
626         char *s1 = options, *s2;
627
628         if (!options)
629                 return 0;
630
631         CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
632
633         while (*s1) {
634                 CDEBUG(D_SUPER, "next opt=%s\n", s1);
635                 tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
636                 if (tmp) {
637                         *flags |= tmp;
638                         goto next;
639                 }
640                 tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
641                 if (tmp) {
642                         *flags |= tmp;
643                         goto next;
644                 }
645                 tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK);
646                 if (tmp) {
647                         *flags |= tmp;
648                         goto next;
649                 }
650                 tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK);
651                 if (tmp) {
652                         *flags &= ~tmp;
653                         goto next;
654                 }
655                 tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
656                 if (tmp) {
657                         *flags |= tmp;
658                         goto next;
659                 }
660                 tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
661                 if (tmp) {
662                         *flags &= ~tmp;
663                         goto next;
664                 }
665                 tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH);
666                 if (tmp) {
667                         *flags |= tmp;
668                         goto next;
669                 }
670                 tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH);
671                 if (tmp) {
672                         *flags &= ~tmp;
673                         goto next;
674                 }
675
676                 tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM);
677                 if (tmp) {
678                         *flags |= tmp;
679                         goto next;
680                 }
681                 tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM);
682                 if (tmp) {
683                         *flags &= ~tmp;
684                         goto next;
685                 }
686                 tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE);
687                 if (tmp) {
688                         *flags |= tmp;
689                         goto next;
690                 }
691                 tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE);
692                 if (tmp) {
693                         *flags &= ~tmp;
694                         goto next;
695                 }
696                 tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS);
697                 if (tmp) {
698                         *flags |= tmp;
699                         goto next;
700                 }
701                 tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS);
702                 if (tmp) {
703                         *flags &= ~tmp;
704                         goto next;
705                 }
706                 tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW);
707                 if (tmp) {
708                         *flags |= tmp;
709                         goto next;
710                 }
711                 tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API);
712                 if (tmp) {
713                         *flags |= tmp;
714                         goto next;
715                 }
716                 tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE);
717                 if (tmp) {
718                         *flags |= tmp;
719                         goto next;
720                 }
721                 tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE);
722                 if (tmp) {
723                         *flags &= ~tmp;
724                         goto next;
725                 }
726                 LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
727                                    s1);
728                 return -EINVAL;
729
730 next:
731                 /* Find next opt */
732                 s2 = strchr(s1, ',');
733                 if (!s2)
734                         break;
735                 s1 = s2 + 1;
736         }
737         return 0;
738 }
739
740 void ll_lli_init(struct ll_inode_info *lli)
741 {
742         lli->lli_inode_magic = LLI_INODE_MAGIC;
743         lli->lli_flags = 0;
744         lli->lli_ioepoch = 0;
745         lli->lli_maxbytes = MAX_LFS_FILESIZE;
746         spin_lock_init(&lli->lli_lock);
747         lli->lli_posix_acl = NULL;
748         /* Do not set lli_fid, it has been initialized already. */
749         fid_zero(&lli->lli_pfid);
750         INIT_LIST_HEAD(&lli->lli_close_list);
751         lli->lli_pending_och = NULL;
752         lli->lli_mds_read_och = NULL;
753         lli->lli_mds_write_och = NULL;
754         lli->lli_mds_exec_och = NULL;
755         lli->lli_open_fd_read_count = 0;
756         lli->lli_open_fd_write_count = 0;
757         lli->lli_open_fd_exec_count = 0;
758         mutex_init(&lli->lli_och_mutex);
759         spin_lock_init(&lli->lli_agl_lock);
760         lli->lli_has_smd = false;
761         spin_lock_init(&lli->lli_layout_lock);
762         ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE);
763         lli->lli_clob = NULL;
764
765         init_rwsem(&lli->lli_xattrs_list_rwsem);
766         mutex_init(&lli->lli_xattrs_enq_lock);
767
768         LASSERT(lli->lli_vfs_inode.i_mode != 0);
769         if (S_ISDIR(lli->lli_vfs_inode.i_mode)) {
770                 mutex_init(&lli->lli_readdir_mutex);
771                 lli->lli_opendir_key = NULL;
772                 lli->lli_sai = NULL;
773                 spin_lock_init(&lli->lli_sa_lock);
774                 lli->lli_opendir_pid = 0;
775         } else {
776                 mutex_init(&lli->lli_size_mutex);
777                 lli->lli_symlink_name = NULL;
778                 init_rwsem(&lli->lli_trunc_sem);
779                 mutex_init(&lli->lli_write_mutex);
780                 init_rwsem(&lli->lli_glimpse_sem);
781                 lli->lli_glimpse_time = 0;
782                 INIT_LIST_HEAD(&lli->lli_agl_list);
783                 lli->lli_agl_index = 0;
784                 lli->lli_async_rc = 0;
785         }
786         mutex_init(&lli->lli_layout_mutex);
787 }
788
789 static inline int ll_bdi_register(struct backing_dev_info *bdi)
790 {
791         static atomic_t ll_bdi_num = ATOMIC_INIT(0);
792
793         bdi->name = "lustre";
794         return bdi_register(bdi, NULL, "lustre-%d",
795                             atomic_inc_return(&ll_bdi_num));
796 }
797
798 int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
799 {
800         struct lustre_profile *lprof = NULL;
801         struct lustre_sb_info *lsi = s2lsi(sb);
802         struct ll_sb_info *sbi;
803         char  *dt = NULL, *md = NULL;
804         char  *profilenm = get_profile_name(sb);
805         struct config_llog_instance *cfg;
806         int    err;
807
808         CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
809
810         cfg = kzalloc(sizeof(*cfg), GFP_NOFS);
811         if (!cfg)
812                 return -ENOMEM;
813
814         try_module_get(THIS_MODULE);
815
816         /* client additional sb info */
817         sbi = ll_init_sbi(sb);
818         lsi->lsi_llsbi = sbi;
819         if (!sbi) {
820                 module_put(THIS_MODULE);
821                 kfree(cfg);
822                 return -ENOMEM;
823         }
824
825         err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
826         if (err)
827                 goto out_free;
828
829         err = bdi_init(&lsi->lsi_bdi);
830         if (err)
831                 goto out_free;
832         lsi->lsi_flags |= LSI_BDI_INITIALIZED;
833         lsi->lsi_bdi.capabilities = 0;
834         err = ll_bdi_register(&lsi->lsi_bdi);
835         if (err)
836                 goto out_free;
837
838         sb->s_bdi = &lsi->lsi_bdi;
839         /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
840         sb->s_d_op = &ll_d_ops;
841
842         /* Generate a string unique to this super, in case some joker tries
843          * to mount the same fs at two mount points.
844          * Use the address of the super itself.
845          */
846         cfg->cfg_instance = sb;
847         cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
848         cfg->cfg_callback = class_config_llog_handler;
849         /* set up client obds */
850         err = lustre_process_log(sb, profilenm, cfg);
851         if (err < 0)
852                 goto out_free;
853
854         /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
855         lprof = class_get_profile(profilenm);
856         if (!lprof) {
857                 LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be read from the MGS.  Does that filesystem exist?\n",
858                                    profilenm);
859                 err = -EINVAL;
860                 goto out_free;
861         }
862         CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
863                lprof->lp_md, lprof->lp_dt);
864
865         dt = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_dt, cfg->cfg_instance);
866         if (!dt) {
867                 err = -ENOMEM;
868                 goto out_free;
869         }
870
871         md = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_md, cfg->cfg_instance);
872         if (!md) {
873                 err = -ENOMEM;
874                 goto out_free;
875         }
876
877         /* connections, registrations, sb setup */
878         err = client_common_fill_super(sb, md, dt, mnt);
879
880 out_free:
881         kfree(md);
882         kfree(dt);
883         if (err)
884                 ll_put_super(sb);
885         else if (sbi->ll_flags & LL_SBI_VERBOSE)
886                 LCONSOLE_WARN("Mounted %s\n", profilenm);
887
888         kfree(cfg);
889         return err;
890 } /* ll_fill_super */
891
892 void ll_put_super(struct super_block *sb)
893 {
894         struct config_llog_instance cfg, params_cfg;
895         struct obd_device *obd;
896         struct lustre_sb_info *lsi = s2lsi(sb);
897         struct ll_sb_info *sbi = ll_s2sbi(sb);
898         char *profilenm = get_profile_name(sb);
899         int ccc_count, next, force = 1, rc = 0;
900
901         CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
902
903         cfg.cfg_instance = sb;
904         lustre_end_log(sb, profilenm, &cfg);
905
906         params_cfg.cfg_instance = sb;
907         lustre_end_log(sb, PARAMS_FILENAME, &params_cfg);
908
909         if (sbi->ll_md_exp) {
910                 obd = class_exp2obd(sbi->ll_md_exp);
911                 if (obd)
912                         force = obd->obd_force;
913         }
914
915         /* Wait for unstable pages to be committed to stable storage */
916         if (!force) {
917                 struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
918
919                 rc = l_wait_event(sbi->ll_cache->ccc_unstable_waitq,
920                                   !atomic_read(&sbi->ll_cache->ccc_unstable_nr),
921                                   &lwi);
922         }
923
924         ccc_count = atomic_read(&sbi->ll_cache->ccc_unstable_nr);
925         if (!force && rc != -EINTR)
926                 LASSERTF(!ccc_count, "count: %i\n", ccc_count);
927
928         /* We need to set force before the lov_disconnect in
929          * lustre_common_put_super, since l_d cleans up osc's as well.
930          */
931         if (force) {
932                 next = 0;
933                 while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
934                                                      &next)) != NULL) {
935                         obd->obd_force = force;
936                 }
937         }
938
939         if (sbi->ll_lcq) {
940                 /* Only if client_common_fill_super succeeded */
941                 client_common_put_super(sb);
942         }
943
944         next = 0;
945         while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)))
946                 class_manual_cleanup(obd);
947
948         if (sbi->ll_flags & LL_SBI_VERBOSE)
949                 LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : "");
950
951         if (profilenm)
952                 class_del_profile(profilenm);
953
954         if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
955                 bdi_destroy(&lsi->lsi_bdi);
956                 lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
957         }
958
959         ll_free_sbi(sb);
960         lsi->lsi_llsbi = NULL;
961
962         lustre_common_put_super(sb);
963
964         cl_env_cache_purge(~0);
965
966         module_put(THIS_MODULE);
967 } /* client_put_super */
968
969 struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock)
970 {
971         struct inode *inode = NULL;
972
973         /* NOTE: we depend on atomic igrab() -bzzz */
974         lock_res_and_lock(lock);
975         if (lock->l_resource->lr_lvb_inode) {
976                 struct ll_inode_info *lli;
977
978                 lli = ll_i2info(lock->l_resource->lr_lvb_inode);
979                 if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
980                         inode = igrab(lock->l_resource->lr_lvb_inode);
981                 } else {
982                         inode = lock->l_resource->lr_lvb_inode;
983                         LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
984                                          D_WARNING, lock, "lr_lvb_inode %p is bogus: magic %08x",
985                                          lock->l_resource->lr_lvb_inode,
986                                          lli->lli_inode_magic);
987                         inode = NULL;
988                 }
989         }
990         unlock_res_and_lock(lock);
991         return inode;
992 }
993
994 void ll_clear_inode(struct inode *inode)
995 {
996         struct ll_inode_info *lli = ll_i2info(inode);
997         struct ll_sb_info *sbi = ll_i2sbi(inode);
998
999         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
1000                PFID(ll_inode2fid(inode)), inode);
1001
1002         if (S_ISDIR(inode->i_mode)) {
1003                 /* these should have been cleared in ll_file_release */
1004                 LASSERT(!lli->lli_opendir_key);
1005                 LASSERT(!lli->lli_sai);
1006                 LASSERT(lli->lli_opendir_pid == 0);
1007         }
1008
1009         spin_lock(&lli->lli_lock);
1010         ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
1011         spin_unlock(&lli->lli_lock);
1012         md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
1013
1014         LASSERT(!lli->lli_open_fd_write_count);
1015         LASSERT(!lli->lli_open_fd_read_count);
1016         LASSERT(!lli->lli_open_fd_exec_count);
1017
1018         if (lli->lli_mds_write_och)
1019                 ll_md_real_close(inode, FMODE_WRITE);
1020         if (lli->lli_mds_exec_och)
1021                 ll_md_real_close(inode, FMODE_EXEC);
1022         if (lli->lli_mds_read_och)
1023                 ll_md_real_close(inode, FMODE_READ);
1024
1025         if (S_ISLNK(inode->i_mode)) {
1026                 kfree(lli->lli_symlink_name);
1027                 lli->lli_symlink_name = NULL;
1028         }
1029
1030         ll_xattr_cache_destroy(inode);
1031
1032 #ifdef CONFIG_FS_POSIX_ACL
1033         if (lli->lli_posix_acl) {
1034                 LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
1035                 posix_acl_release(lli->lli_posix_acl);
1036                 lli->lli_posix_acl = NULL;
1037         }
1038 #endif
1039         lli->lli_inode_magic = LLI_INODE_DEAD;
1040
1041         if (!S_ISDIR(inode->i_mode))
1042                 LASSERT(list_empty(&lli->lli_agl_list));
1043
1044         /*
1045          * XXX This has to be done before lsm is freed below, because
1046          * cl_object still uses inode lsm.
1047          */
1048         cl_inode_fini(inode);
1049         lli->lli_has_smd = false;
1050 }
1051
1052 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
1053
1054 static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
1055                          struct md_open_data **mod)
1056 {
1057         struct lustre_md md;
1058         struct inode *inode = d_inode(dentry);
1059         struct ll_sb_info *sbi = ll_i2sbi(inode);
1060         struct ptlrpc_request *request = NULL;
1061         int rc, ia_valid;
1062
1063         op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
1064                                      LUSTRE_OPC_ANY, NULL);
1065         if (IS_ERR(op_data))
1066                 return PTR_ERR(op_data);
1067
1068         rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0,
1069                         &request, mod);
1070         if (rc) {
1071                 ptlrpc_req_finished(request);
1072                 if (rc == -ENOENT) {
1073                         clear_nlink(inode);
1074                         /* Unlinked special device node? Or just a race?
1075                          * Pretend we did everything.
1076                          */
1077                         if (!S_ISREG(inode->i_mode) &&
1078                             !S_ISDIR(inode->i_mode)) {
1079                                 ia_valid = op_data->op_attr.ia_valid;
1080                                 op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
1081                                 rc = simple_setattr(dentry, &op_data->op_attr);
1082                                 op_data->op_attr.ia_valid = ia_valid;
1083                         }
1084                 } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) {
1085                         CERROR("md_setattr fails: rc = %d\n", rc);
1086                 }
1087                 return rc;
1088         }
1089
1090         rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
1091                               sbi->ll_md_exp, &md);
1092         if (rc) {
1093                 ptlrpc_req_finished(request);
1094                 return rc;
1095         }
1096
1097         ia_valid = op_data->op_attr.ia_valid;
1098         /* inode size will be in cl_setattr_ost, can't do it now since dirty
1099          * cache is not cleared yet.
1100          */
1101         op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
1102         rc = simple_setattr(dentry, &op_data->op_attr);
1103         op_data->op_attr.ia_valid = ia_valid;
1104
1105         /* Extract epoch data if obtained. */
1106         op_data->op_handle = md.body->handle;
1107         op_data->op_ioepoch = md.body->ioepoch;
1108
1109         ll_update_inode(inode, &md);
1110         ptlrpc_req_finished(request);
1111
1112         return rc;
1113 }
1114
1115 /* Close IO epoch and send Size-on-MDS attribute update. */
1116 static int ll_setattr_done_writing(struct inode *inode,
1117                                    struct md_op_data *op_data,
1118                                    struct md_open_data *mod)
1119 {
1120         struct ll_inode_info *lli = ll_i2info(inode);
1121         int rc = 0;
1122
1123         if (!S_ISREG(inode->i_mode))
1124                 return 0;
1125
1126         CDEBUG(D_INODE, "Epoch %llu closed on "DFID" for truncate\n",
1127                op_data->op_ioepoch, PFID(&lli->lli_fid));
1128
1129         op_data->op_flags = MF_EPOCH_CLOSE;
1130         ll_done_writing_attr(inode, op_data);
1131         ll_pack_inode2opdata(inode, op_data, NULL);
1132
1133         rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod);
1134         if (rc == -EAGAIN)
1135                 /* MDS has instructed us to obtain Size-on-MDS attribute
1136                  * from OSTs and send setattr to back to MDS.
1137                  */
1138                 rc = ll_som_update(inode, op_data);
1139         else if (rc) {
1140                 CERROR("%s: inode "DFID" mdc truncate failed: rc = %d\n",
1141                       ll_i2sbi(inode)->ll_md_exp->exp_obd->obd_name,
1142                       PFID(ll_inode2fid(inode)), rc);
1143         }
1144         return rc;
1145 }
1146
1147 /* If this inode has objects allocated to it (lsm != NULL), then the OST
1148  * object(s) determine the file size and mtime.  Otherwise, the MDS will
1149  * keep these values until such a time that objects are allocated for it.
1150  * We do the MDS operations first, as it is checking permissions for us.
1151  * We don't to the MDS RPC if there is nothing that we want to store there,
1152  * otherwise there is no harm in updating mtime/atime on the MDS if we are
1153  * going to do an RPC anyways.
1154  *
1155  * If we are doing a truncate, we will send the mtime and ctime updates
1156  * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
1157  * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
1158  * at the same time.
1159  *
1160  * In case of HSMimport, we only set attr on MDS.
1161  */
1162 int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
1163 {
1164         struct inode *inode = d_inode(dentry);
1165         struct ll_inode_info *lli = ll_i2info(inode);
1166         struct md_op_data *op_data = NULL;
1167         struct md_open_data *mod = NULL;
1168         bool file_is_released = false;
1169         int rc = 0, rc1 = 0;
1170
1171         CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, valid %x, hsm_import %d\n",
1172                ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), inode,
1173                i_size_read(inode), attr->ia_size, attr->ia_valid, hsm_import);
1174
1175         if (attr->ia_valid & ATTR_SIZE) {
1176                 /* Check new size against VFS/VM file size limit and rlimit */
1177                 rc = inode_newsize_ok(inode, attr->ia_size);
1178                 if (rc)
1179                         return rc;
1180
1181                 /* The maximum Lustre file size is variable, based on the
1182                  * OST maximum object size and number of stripes.  This
1183                  * needs another check in addition to the VFS check above.
1184                  */
1185                 if (attr->ia_size > ll_file_maxbytes(inode)) {
1186                         CDEBUG(D_INODE, "file "DFID" too large %llu > %llu\n",
1187                                PFID(&lli->lli_fid), attr->ia_size,
1188                                ll_file_maxbytes(inode));
1189                         return -EFBIG;
1190                 }
1191
1192                 attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
1193         }
1194
1195         /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
1196         if (attr->ia_valid & TIMES_SET_FLAGS) {
1197                 if ((!uid_eq(current_fsuid(), inode->i_uid)) &&
1198                     !capable(CFS_CAP_FOWNER))
1199                         return -EPERM;
1200         }
1201
1202         /* We mark all of the fields "set" so MDS/OST does not re-set them */
1203         if (attr->ia_valid & ATTR_CTIME) {
1204                 attr->ia_ctime = CURRENT_TIME;
1205                 attr->ia_valid |= ATTR_CTIME_SET;
1206         }
1207         if (!(attr->ia_valid & ATTR_ATIME_SET) &&
1208             (attr->ia_valid & ATTR_ATIME)) {
1209                 attr->ia_atime = CURRENT_TIME;
1210                 attr->ia_valid |= ATTR_ATIME_SET;
1211         }
1212         if (!(attr->ia_valid & ATTR_MTIME_SET) &&
1213             (attr->ia_valid & ATTR_MTIME)) {
1214                 attr->ia_mtime = CURRENT_TIME;
1215                 attr->ia_valid |= ATTR_MTIME_SET;
1216         }
1217
1218         if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
1219                 CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %llu\n",
1220                        LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
1221                        (s64)ktime_get_real_seconds());
1222
1223         /* We always do an MDS RPC, even if we're only changing the size;
1224          * only the MDS knows whether truncate() should fail with -ETXTBUSY
1225          */
1226
1227         op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
1228         if (!op_data)
1229                 return -ENOMEM;
1230
1231         if (!S_ISDIR(inode->i_mode))
1232                 inode_unlock(inode);
1233
1234         /* truncate on a released file must failed with -ENODATA,
1235          * so size must not be set on MDS for released file
1236          * but other attributes must be set
1237          */
1238         if (S_ISREG(inode->i_mode)) {
1239                 struct lov_stripe_md *lsm;
1240                 __u32 gen;
1241
1242                 ll_layout_refresh(inode, &gen);
1243                 lsm = ccc_inode_lsm_get(inode);
1244                 if (lsm && lsm->lsm_pattern & LOV_PATTERN_F_RELEASED)
1245                         file_is_released = true;
1246                 ccc_inode_lsm_put(inode, lsm);
1247
1248                 if (!hsm_import && attr->ia_valid & ATTR_SIZE) {
1249                         if (file_is_released) {
1250                                 rc = ll_layout_restore(inode, 0, attr->ia_size);
1251                                 if (rc < 0)
1252                                         goto out;
1253
1254                                 file_is_released = false;
1255                                 ll_layout_refresh(inode, &gen);
1256                         }
1257
1258                         /*
1259                          * If we are changing file size, file content is
1260                          * modified, flag it.
1261                          */
1262                         attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
1263                         spin_lock(&lli->lli_lock);
1264                         lli->lli_flags |= LLIF_DATA_MODIFIED;
1265                         spin_unlock(&lli->lli_lock);
1266                         op_data->op_bias |= MDS_DATA_MODIFIED;
1267                 }
1268         }
1269
1270         memcpy(&op_data->op_attr, attr, sizeof(*attr));
1271
1272         /* Open epoch for truncate. */
1273         if (exp_connect_som(ll_i2mdexp(inode)) && !hsm_import &&
1274             (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
1275                 op_data->op_flags = MF_EPOCH_OPEN;
1276
1277         rc = ll_md_setattr(dentry, op_data, &mod);
1278         if (rc)
1279                 goto out;
1280
1281         /* RPC to MDT is sent, cancel data modification flag */
1282         if (op_data->op_bias & MDS_DATA_MODIFIED) {
1283                 spin_lock(&lli->lli_lock);
1284                 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
1285                 spin_unlock(&lli->lli_lock);
1286         }
1287
1288         ll_ioepoch_open(lli, op_data->op_ioepoch);
1289         if (!S_ISREG(inode->i_mode) || file_is_released) {
1290                 rc = 0;
1291                 goto out;
1292         }
1293
1294         if (attr->ia_valid & (ATTR_SIZE |
1295                               ATTR_ATIME | ATTR_ATIME_SET |
1296                               ATTR_MTIME | ATTR_MTIME_SET)) {
1297                 /* For truncate and utimes sending attributes to OSTs, setting
1298                  * mtime/atime to the past will be performed under PW [0:EOF]
1299                  * extent lock (new_size:EOF for truncate).  It may seem
1300                  * excessive to send mtime/atime updates to OSTs when not
1301                  * setting times to past, but it is necessary due to possible
1302                  * time de-synchronization between MDT inode and OST objects
1303                  */
1304                 if (attr->ia_valid & ATTR_SIZE)
1305                         down_write(&lli->lli_trunc_sem);
1306                 rc = cl_setattr_ost(inode, attr);
1307                 if (attr->ia_valid & ATTR_SIZE)
1308                         up_write(&lli->lli_trunc_sem);
1309         }
1310 out:
1311         if (op_data->op_ioepoch) {
1312                 rc1 = ll_setattr_done_writing(inode, op_data, mod);
1313                 if (!rc)
1314                         rc = rc1;
1315         }
1316         ll_finish_md_op_data(op_data);
1317
1318         if (!S_ISDIR(inode->i_mode)) {
1319                 inode_lock(inode);
1320                 if ((attr->ia_valid & ATTR_SIZE) && !hsm_import)
1321                         inode_dio_wait(inode);
1322         }
1323
1324         ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ?
1325                         LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1);
1326
1327         return rc;
1328 }
1329
1330 int ll_setattr(struct dentry *de, struct iattr *attr)
1331 {
1332         int mode = d_inode(de)->i_mode;
1333
1334         if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
1335                               (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
1336                 attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
1337
1338         if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
1339                                (ATTR_SIZE|ATTR_MODE)) &&
1340             (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) ||
1341              (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
1342               !(attr->ia_mode & S_ISGID))))
1343                 attr->ia_valid |= ATTR_FORCE;
1344
1345         if ((attr->ia_valid & ATTR_MODE) &&
1346             (mode & S_ISUID) &&
1347             !(attr->ia_mode & S_ISUID) &&
1348             !(attr->ia_valid & ATTR_KILL_SUID))
1349                 attr->ia_valid |= ATTR_KILL_SUID;
1350
1351         if ((attr->ia_valid & ATTR_MODE) &&
1352             ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
1353             !(attr->ia_mode & S_ISGID) &&
1354             !(attr->ia_valid & ATTR_KILL_SGID))
1355                 attr->ia_valid |= ATTR_KILL_SGID;
1356
1357         return ll_setattr_raw(de, attr, false);
1358 }
1359
1360 int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
1361                        __u64 max_age, __u32 flags)
1362 {
1363         struct ll_sb_info *sbi = ll_s2sbi(sb);
1364         struct obd_statfs obd_osfs;
1365         int rc;
1366
1367         rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags);
1368         if (rc) {
1369                 CERROR("md_statfs fails: rc = %d\n", rc);
1370                 return rc;
1371         }
1372
1373         osfs->os_type = sb->s_magic;
1374
1375         CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n",
1376                osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,
1377                osfs->os_files);
1378
1379         if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
1380                 flags |= OBD_STATFS_NODELAY;
1381
1382         rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags);
1383         if (rc) {
1384                 CERROR("obd_statfs fails: rc = %d\n", rc);
1385                 return rc;
1386         }
1387
1388         CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n",
1389                obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree,
1390                obd_osfs.os_files);
1391
1392         osfs->os_bsize = obd_osfs.os_bsize;
1393         osfs->os_blocks = obd_osfs.os_blocks;
1394         osfs->os_bfree = obd_osfs.os_bfree;
1395         osfs->os_bavail = obd_osfs.os_bavail;
1396
1397         /* If we don't have as many objects free on the OST as inodes
1398          * on the MDS, we reduce the total number of inodes to
1399          * compensate, so that the "inodes in use" number is correct.
1400          */
1401         if (obd_osfs.os_ffree < osfs->os_ffree) {
1402                 osfs->os_files = (osfs->os_files - osfs->os_ffree) +
1403                         obd_osfs.os_ffree;
1404                 osfs->os_ffree = obd_osfs.os_ffree;
1405         }
1406
1407         return rc;
1408 }
1409
1410 int ll_statfs(struct dentry *de, struct kstatfs *sfs)
1411 {
1412         struct super_block *sb = de->d_sb;
1413         struct obd_statfs osfs;
1414         int rc;
1415
1416         CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64());
1417         ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
1418
1419         /* Some amount of caching on the client is allowed */
1420         rc = ll_statfs_internal(sb, &osfs,
1421                                 cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
1422                                 0);
1423         if (rc)
1424                 return rc;
1425
1426         statfs_unpack(sfs, &osfs);
1427
1428         /* We need to downshift for all 32-bit kernels, because we can't
1429          * tell if the kernel is being called via sys_statfs64() or not.
1430          * Stop before overflowing f_bsize - in which case it is better
1431          * to just risk EOVERFLOW if caller is using old sys_statfs().
1432          */
1433         if (sizeof(long) < 8) {
1434                 while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
1435                         sfs->f_bsize <<= 1;
1436
1437                         osfs.os_blocks >>= 1;
1438                         osfs.os_bfree >>= 1;
1439                         osfs.os_bavail >>= 1;
1440                 }
1441         }
1442
1443         sfs->f_blocks = osfs.os_blocks;
1444         sfs->f_bfree = osfs.os_bfree;
1445         sfs->f_bavail = osfs.os_bavail;
1446         sfs->f_fsid = ll_s2sbi(sb)->ll_fsid;
1447         return 0;
1448 }
1449
1450 void ll_inode_size_lock(struct inode *inode)
1451 {
1452         struct ll_inode_info *lli;
1453
1454         LASSERT(!S_ISDIR(inode->i_mode));
1455
1456         lli = ll_i2info(inode);
1457         mutex_lock(&lli->lli_size_mutex);
1458 }
1459
1460 void ll_inode_size_unlock(struct inode *inode)
1461 {
1462         struct ll_inode_info *lli;
1463
1464         lli = ll_i2info(inode);
1465         mutex_unlock(&lli->lli_size_mutex);
1466 }
1467
1468 void ll_update_inode(struct inode *inode, struct lustre_md *md)
1469 {
1470         struct ll_inode_info *lli = ll_i2info(inode);
1471         struct mdt_body *body = md->body;
1472         struct lov_stripe_md *lsm = md->lsm;
1473         struct ll_sb_info *sbi = ll_i2sbi(inode);
1474
1475         LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
1476         if (lsm) {
1477                 if (!lli->lli_has_smd &&
1478                     !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
1479                         cl_file_inode_init(inode, md);
1480
1481                 lli->lli_maxbytes = lsm->lsm_maxbytes;
1482                 if (lli->lli_maxbytes > MAX_LFS_FILESIZE)
1483                         lli->lli_maxbytes = MAX_LFS_FILESIZE;
1484         }
1485
1486 #ifdef CONFIG_FS_POSIX_ACL
1487         if (body->valid & OBD_MD_FLACL) {
1488                 spin_lock(&lli->lli_lock);
1489                 if (lli->lli_posix_acl)
1490                         posix_acl_release(lli->lli_posix_acl);
1491                 lli->lli_posix_acl = md->posix_acl;
1492                 spin_unlock(&lli->lli_lock);
1493         }
1494 #endif
1495         inode->i_ino = cl_fid_build_ino(&body->fid1,
1496                                         sbi->ll_flags & LL_SBI_32BIT_API);
1497         inode->i_generation = cl_fid_build_gen(&body->fid1);
1498
1499         if (body->valid & OBD_MD_FLATIME) {
1500                 if (body->atime > LTIME_S(inode->i_atime))
1501                         LTIME_S(inode->i_atime) = body->atime;
1502                 lli->lli_atime = body->atime;
1503         }
1504         if (body->valid & OBD_MD_FLMTIME) {
1505                 if (body->mtime > LTIME_S(inode->i_mtime)) {
1506                         CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n",
1507                                inode->i_ino, LTIME_S(inode->i_mtime),
1508                                body->mtime);
1509                         LTIME_S(inode->i_mtime) = body->mtime;
1510                 }
1511                 lli->lli_mtime = body->mtime;
1512         }
1513         if (body->valid & OBD_MD_FLCTIME) {
1514                 if (body->ctime > LTIME_S(inode->i_ctime))
1515                         LTIME_S(inode->i_ctime) = body->ctime;
1516                 lli->lli_ctime = body->ctime;
1517         }
1518         if (body->valid & OBD_MD_FLMODE)
1519                 inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
1520         if (body->valid & OBD_MD_FLTYPE)
1521                 inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
1522         LASSERT(inode->i_mode != 0);
1523         if (S_ISREG(inode->i_mode))
1524                 inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1,
1525                                        LL_MAX_BLKSIZE_BITS);
1526         else
1527                 inode->i_blkbits = inode->i_sb->s_blocksize_bits;
1528         if (body->valid & OBD_MD_FLUID)
1529                 inode->i_uid = make_kuid(&init_user_ns, body->uid);
1530         if (body->valid & OBD_MD_FLGID)
1531                 inode->i_gid = make_kgid(&init_user_ns, body->gid);
1532         if (body->valid & OBD_MD_FLFLAGS)
1533                 inode->i_flags = ll_ext_to_inode_flags(body->flags);
1534         if (body->valid & OBD_MD_FLNLINK)
1535                 set_nlink(inode, body->nlink);
1536         if (body->valid & OBD_MD_FLRDEV)
1537                 inode->i_rdev = old_decode_dev(body->rdev);
1538
1539         if (body->valid & OBD_MD_FLID) {
1540                 /* FID shouldn't be changed! */
1541                 if (fid_is_sane(&lli->lli_fid)) {
1542                         LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1),
1543                                  "Trying to change FID "DFID" to the "DFID", inode "DFID"(%p)\n",
1544                                  PFID(&lli->lli_fid), PFID(&body->fid1),
1545                                  PFID(ll_inode2fid(inode)), inode);
1546                 } else {
1547                         lli->lli_fid = body->fid1;
1548                 }
1549         }
1550
1551         LASSERT(fid_seq(&lli->lli_fid) != 0);
1552
1553         if (body->valid & OBD_MD_FLSIZE) {
1554                 if (exp_connect_som(ll_i2mdexp(inode)) &&
1555                     S_ISREG(inode->i_mode)) {
1556                         struct lustre_handle lockh;
1557                         enum ldlm_mode mode;
1558
1559                         /* As it is possible a blocking ast has been processed
1560                          * by this time, we need to check there is an UPDATE
1561                          * lock on the client and set LLIF_MDS_SIZE_LOCK holding
1562                          * it.
1563                          */
1564                         mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE,
1565                                                &lockh, LDLM_FL_CBPENDING,
1566                                                LCK_CR | LCK_CW |
1567                                                LCK_PR | LCK_PW);
1568                         if (mode) {
1569                                 if (lli->lli_flags & (LLIF_DONE_WRITING |
1570                                                       LLIF_EPOCH_PENDING |
1571                                                       LLIF_SOM_DIRTY)) {
1572                                         CERROR("%s: inode "DFID" flags %u still has size authority! do not trust the size got from MDS\n",
1573                                                sbi->ll_md_exp->exp_obd->obd_name,
1574                                                PFID(ll_inode2fid(inode)),
1575                                                lli->lli_flags);
1576                                 } else {
1577                                         /* Use old size assignment to avoid
1578                                          * deadlock bz14138 & bz14326
1579                                          */
1580                                         i_size_write(inode, body->size);
1581                                         spin_lock(&lli->lli_lock);
1582                                         lli->lli_flags |= LLIF_MDS_SIZE_LOCK;
1583                                         spin_unlock(&lli->lli_lock);
1584                                 }
1585                                 ldlm_lock_decref(&lockh, mode);
1586                         }
1587                 } else {
1588                         /* Use old size assignment to avoid
1589                          * deadlock bz14138 & bz14326
1590                          */
1591                         i_size_write(inode, body->size);
1592
1593                         CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n",
1594                                inode->i_ino, (unsigned long long)body->size);
1595                 }
1596
1597                 if (body->valid & OBD_MD_FLBLOCKS)
1598                         inode->i_blocks = body->blocks;
1599         }
1600
1601         if (body->valid & OBD_MD_TSTATE) {
1602                 if (body->t_state & MS_RESTORE)
1603                         lli->lli_flags |= LLIF_FILE_RESTORING;
1604         }
1605 }
1606
1607 void ll_read_inode2(struct inode *inode, void *opaque)
1608 {
1609         struct lustre_md *md = opaque;
1610         struct ll_inode_info *lli = ll_i2info(inode);
1611
1612         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
1613                PFID(&lli->lli_fid), inode);
1614
1615         LASSERT(!lli->lli_has_smd);
1616
1617         /* Core attributes from the MDS first.  This is a new inode, and
1618          * the VFS doesn't zero times in the core inode so we have to do
1619          * it ourselves.  They will be overwritten by either MDS or OST
1620          * attributes - we just need to make sure they aren't newer.
1621          */
1622         LTIME_S(inode->i_mtime) = 0;
1623         LTIME_S(inode->i_atime) = 0;
1624         LTIME_S(inode->i_ctime) = 0;
1625         inode->i_rdev = 0;
1626         ll_update_inode(inode, md);
1627
1628         /* OIDEBUG(inode); */
1629
1630         if (S_ISREG(inode->i_mode)) {
1631                 struct ll_sb_info *sbi = ll_i2sbi(inode);
1632
1633                 inode->i_op = &ll_file_inode_operations;
1634                 inode->i_fop = sbi->ll_fop;
1635                 inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops;
1636         } else if (S_ISDIR(inode->i_mode)) {
1637                 inode->i_op = &ll_dir_inode_operations;
1638                 inode->i_fop = &ll_dir_operations;
1639         } else if (S_ISLNK(inode->i_mode)) {
1640                 inode->i_op = &ll_fast_symlink_inode_operations;
1641         } else {
1642                 inode->i_op = &ll_special_inode_operations;
1643
1644                 init_special_inode(inode, inode->i_mode,
1645                                    inode->i_rdev);
1646         }
1647 }
1648
1649 void ll_delete_inode(struct inode *inode)
1650 {
1651         struct ll_inode_info *lli = ll_i2info(inode);
1652
1653         if (S_ISREG(inode->i_mode) && lli->lli_clob)
1654                 /* discard all dirty pages before truncating them, required by
1655                  * osc_extent implementation at LU-1030.
1656                  */
1657                 cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
1658                                    CL_FSYNC_DISCARD, 1);
1659
1660         truncate_inode_pages_final(&inode->i_data);
1661
1662         /* Workaround for LU-118 */
1663         if (inode->i_data.nrpages) {
1664                 spin_lock_irq(&inode->i_data.tree_lock);
1665                 spin_unlock_irq(&inode->i_data.tree_lock);
1666                 LASSERTF(inode->i_data.nrpages == 0,
1667                          "inode="DFID"(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n",
1668                          PFID(ll_inode2fid(inode)), inode,
1669                          inode->i_data.nrpages);
1670         }
1671         /* Workaround end */
1672
1673         ll_clear_inode(inode);
1674         clear_inode(inode);
1675 }
1676
1677 int ll_iocontrol(struct inode *inode, struct file *file,
1678                  unsigned int cmd, unsigned long arg)
1679 {
1680         struct ll_sb_info *sbi = ll_i2sbi(inode);
1681         struct ptlrpc_request *req = NULL;
1682         int rc, flags = 0;
1683
1684         switch (cmd) {
1685         case FSFILT_IOC_GETFLAGS: {
1686                 struct mdt_body *body;
1687                 struct md_op_data *op_data;
1688
1689                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
1690                                              0, 0, LUSTRE_OPC_ANY,
1691                                              NULL);
1692                 if (IS_ERR(op_data))
1693                         return PTR_ERR(op_data);
1694
1695                 op_data->op_valid = OBD_MD_FLFLAGS;
1696                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
1697                 ll_finish_md_op_data(op_data);
1698                 if (rc) {
1699                         CERROR("%s: failure inode "DFID": rc = %d\n",
1700                                sbi->ll_md_exp->exp_obd->obd_name,
1701                                PFID(ll_inode2fid(inode)), rc);
1702                         return -abs(rc);
1703                 }
1704
1705                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1706
1707                 flags = body->flags;
1708
1709                 ptlrpc_req_finished(req);
1710
1711                 return put_user(flags, (int __user *)arg);
1712         }
1713         case FSFILT_IOC_SETFLAGS: {
1714                 struct lov_stripe_md *lsm;
1715                 struct obd_info oinfo = { };
1716                 struct md_op_data *op_data;
1717
1718                 if (get_user(flags, (int __user *)arg))
1719                         return -EFAULT;
1720
1721                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1722                                              LUSTRE_OPC_ANY, NULL);
1723                 if (IS_ERR(op_data))
1724                         return PTR_ERR(op_data);
1725
1726                 op_data->op_attr_flags = flags;
1727                 op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
1728                 rc = md_setattr(sbi->ll_md_exp, op_data,
1729                                 NULL, 0, NULL, 0, &req, NULL);
1730                 ll_finish_md_op_data(op_data);
1731                 ptlrpc_req_finished(req);
1732                 if (rc)
1733                         return rc;
1734
1735                 inode->i_flags = ll_ext_to_inode_flags(flags);
1736
1737                 lsm = ccc_inode_lsm_get(inode);
1738                 if (!lsm_has_objects(lsm)) {
1739                         ccc_inode_lsm_put(inode, lsm);
1740                         return 0;
1741                 }
1742
1743                 oinfo.oi_oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS);
1744                 if (!oinfo.oi_oa) {
1745                         ccc_inode_lsm_put(inode, lsm);
1746                         return -ENOMEM;
1747                 }
1748                 oinfo.oi_md = lsm;
1749                 oinfo.oi_oa->o_oi = lsm->lsm_oi;
1750                 oinfo.oi_oa->o_flags = flags;
1751                 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS |
1752                                        OBD_MD_FLGROUP;
1753                 obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid);
1754                 rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
1755                 kmem_cache_free(obdo_cachep, oinfo.oi_oa);
1756                 ccc_inode_lsm_put(inode, lsm);
1757
1758                 if (rc && rc != -EPERM && rc != -EACCES)
1759                         CERROR("osc_setattr_async fails: rc = %d\n", rc);
1760
1761                 return rc;
1762         }
1763         default:
1764                 return -ENOSYS;
1765         }
1766
1767         return 0;
1768 }
1769
1770 int ll_flush_ctx(struct inode *inode)
1771 {
1772         struct ll_sb_info  *sbi = ll_i2sbi(inode);
1773
1774         CDEBUG(D_SEC, "flush context for user %d\n",
1775                from_kuid(&init_user_ns, current_uid()));
1776
1777         obd_set_info_async(NULL, sbi->ll_md_exp,
1778                            sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
1779                            0, NULL, NULL);
1780         obd_set_info_async(NULL, sbi->ll_dt_exp,
1781                            sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
1782                            0, NULL, NULL);
1783         return 0;
1784 }
1785
1786 /* umount -f client means force down, don't save state */
1787 void ll_umount_begin(struct super_block *sb)
1788 {
1789         struct ll_sb_info *sbi = ll_s2sbi(sb);
1790         struct obd_device *obd;
1791         struct obd_ioctl_data *ioc_data;
1792
1793         CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
1794                sb->s_count, atomic_read(&sb->s_active));
1795
1796         obd = class_exp2obd(sbi->ll_md_exp);
1797         if (!obd) {
1798                 CERROR("Invalid MDC connection handle %#llx\n",
1799                        sbi->ll_md_exp->exp_handle.h_cookie);
1800                 return;
1801         }
1802         obd->obd_force = 1;
1803
1804         obd = class_exp2obd(sbi->ll_dt_exp);
1805         if (!obd) {
1806                 CERROR("Invalid LOV connection handle %#llx\n",
1807                        sbi->ll_dt_exp->exp_handle.h_cookie);
1808                 return;
1809         }
1810         obd->obd_force = 1;
1811
1812         ioc_data = kzalloc(sizeof(*ioc_data), GFP_NOFS);
1813         if (ioc_data) {
1814                 obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
1815                               sizeof(*ioc_data), ioc_data, NULL);
1816
1817                 obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
1818                               sizeof(*ioc_data), ioc_data, NULL);
1819
1820                 kfree(ioc_data);
1821         }
1822
1823         /* Really, we'd like to wait until there are no requests outstanding,
1824          * and then continue.  For now, we just invalidate the requests,
1825          * schedule() and sleep one second if needed, and hope.
1826          */
1827         schedule();
1828 }
1829
1830 int ll_remount_fs(struct super_block *sb, int *flags, char *data)
1831 {
1832         struct ll_sb_info *sbi = ll_s2sbi(sb);
1833         char *profilenm = get_profile_name(sb);
1834         int err;
1835         __u32 read_only;
1836
1837         if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
1838                 read_only = *flags & MS_RDONLY;
1839                 err = obd_set_info_async(NULL, sbi->ll_md_exp,
1840                                          sizeof(KEY_READ_ONLY),
1841                                          KEY_READ_ONLY, sizeof(read_only),
1842                                          &read_only, NULL);
1843                 if (err) {
1844                         LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
1845                                       profilenm, read_only ?
1846                                       "read-only" : "read-write", err);
1847                         return err;
1848                 }
1849
1850                 if (read_only)
1851                         sb->s_flags |= MS_RDONLY;
1852                 else
1853                         sb->s_flags &= ~MS_RDONLY;
1854
1855                 if (sbi->ll_flags & LL_SBI_VERBOSE)
1856                         LCONSOLE_WARN("Remounted %s %s\n", profilenm,
1857                                       read_only ?  "read-only" : "read-write");
1858         }
1859         return 0;
1860 }
1861
1862 /**
1863  * Cleanup the open handle that is cached on MDT-side.
1864  *
1865  * For open case, the client side open handling thread may hit error
1866  * after the MDT grant the open. Under such case, the client should
1867  * send close RPC to the MDT as cleanup; otherwise, the open handle
1868  * on the MDT will be leaked there until the client umount or evicted.
1869  *
1870  * In further, if someone unlinked the file, because the open handle
1871  * holds the reference on such file/object, then it will block the
1872  * subsequent threads that want to locate such object via FID.
1873  *
1874  * \param[in] sb        super block for this file-system
1875  * \param[in] open_req  pointer to the original open request
1876  */
1877 void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req)
1878 {
1879         struct mdt_body                 *body;
1880         struct md_op_data               *op_data;
1881         struct ptlrpc_request           *close_req = NULL;
1882         struct obd_export               *exp       = ll_s2sbi(sb)->ll_md_exp;
1883
1884         body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
1885         op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
1886         if (!op_data)
1887                 return;
1888
1889         op_data->op_fid1 = body->fid1;
1890         op_data->op_ioepoch = body->ioepoch;
1891         op_data->op_handle = body->handle;
1892         op_data->op_mod_time = get_seconds();
1893         md_close(exp, op_data, NULL, &close_req);
1894         ptlrpc_req_finished(close_req);
1895         ll_finish_md_op_data(op_data);
1896 }
1897
1898 int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
1899                   struct super_block *sb, struct lookup_intent *it)
1900 {
1901         struct ll_sb_info *sbi = NULL;
1902         struct lustre_md md = { NULL };
1903         int rc;
1904
1905         LASSERT(*inode || sb);
1906         sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
1907         rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
1908                               sbi->ll_md_exp, &md);
1909         if (rc)
1910                 goto cleanup;
1911
1912         if (*inode) {
1913                 ll_update_inode(*inode, &md);
1914         } else {
1915                 LASSERT(sb);
1916
1917                 /*
1918                  * At this point server returns to client's same fid as client
1919                  * generated for creating. So using ->fid1 is okay here.
1920                  */
1921                 if (!fid_is_sane(&md.body->fid1)) {
1922                         CERROR("%s: Fid is insane " DFID "\n",
1923                                ll_get_fsname(sb, NULL, 0),
1924                                PFID(&md.body->fid1));
1925                         rc = -EINVAL;
1926                         goto out;
1927                 }
1928
1929                 *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1,
1930                                              sbi->ll_flags & LL_SBI_32BIT_API),
1931                                  &md);
1932                 if (!*inode) {
1933 #ifdef CONFIG_FS_POSIX_ACL
1934                         if (md.posix_acl) {
1935                                 posix_acl_release(md.posix_acl);
1936                                 md.posix_acl = NULL;
1937                         }
1938 #endif
1939                         rc = -ENOMEM;
1940                         CERROR("new_inode -fatal: rc %d\n", rc);
1941                         goto out;
1942                 }
1943         }
1944
1945         /* Handling piggyback layout lock.
1946          * Layout lock can be piggybacked by getattr and open request.
1947          * The lsm can be applied to inode only if it comes with a layout lock
1948          * otherwise correct layout may be overwritten, for example:
1949          * 1. proc1: mdt returns a lsm but not granting layout
1950          * 2. layout was changed by another client
1951          * 3. proc2: refresh layout and layout lock granted
1952          * 4. proc1: to apply a stale layout
1953          */
1954         if (it && it->it_lock_mode != 0) {
1955                 struct lustre_handle lockh;
1956                 struct ldlm_lock *lock;
1957
1958                 lockh.cookie = it->it_lock_handle;
1959                 lock = ldlm_handle2lock(&lockh);
1960                 LASSERT(lock);
1961                 if (ldlm_has_layout(lock)) {
1962                         struct cl_object_conf conf;
1963
1964                         memset(&conf, 0, sizeof(conf));
1965                         conf.coc_opc = OBJECT_CONF_SET;
1966                         conf.coc_inode = *inode;
1967                         conf.coc_lock = lock;
1968                         conf.u.coc_md = &md;
1969                         (void)ll_layout_conf(*inode, &conf);
1970                 }
1971                 LDLM_LOCK_PUT(lock);
1972         }
1973
1974 out:
1975         if (md.lsm)
1976                 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
1977         md_free_lustre_md(sbi->ll_md_exp, &md);
1978
1979 cleanup:
1980         if (rc != 0 && it && it->it_op & IT_OPEN)
1981                 ll_open_cleanup(sb ? sb : (*inode)->i_sb, req);
1982
1983         return rc;
1984 }
1985
1986 int ll_obd_statfs(struct inode *inode, void __user *arg)
1987 {
1988         struct ll_sb_info *sbi = NULL;
1989         struct obd_export *exp;
1990         char *buf = NULL;
1991         struct obd_ioctl_data *data = NULL;
1992         __u32 type;
1993         int len = 0, rc;
1994
1995         if (!inode) {
1996                 rc = -EINVAL;
1997                 goto out_statfs;
1998         }
1999
2000         sbi = ll_i2sbi(inode);
2001         if (!sbi) {
2002                 rc = -EINVAL;
2003                 goto out_statfs;
2004         }
2005
2006         rc = obd_ioctl_getdata(&buf, &len, arg);
2007         if (rc)
2008                 goto out_statfs;
2009
2010         data = (void *)buf;
2011         if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
2012             !data->ioc_pbuf1 || !data->ioc_pbuf2) {
2013                 rc = -EINVAL;
2014                 goto out_statfs;
2015         }
2016
2017         if (data->ioc_inllen1 != sizeof(__u32) ||
2018             data->ioc_inllen2 != sizeof(__u32) ||
2019             data->ioc_plen1 != sizeof(struct obd_statfs) ||
2020             data->ioc_plen2 != sizeof(struct obd_uuid)) {
2021                 rc = -EINVAL;
2022                 goto out_statfs;
2023         }
2024
2025         memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
2026         if (type & LL_STATFS_LMV) {
2027                 exp = sbi->ll_md_exp;
2028         } else if (type & LL_STATFS_LOV) {
2029                 exp = sbi->ll_dt_exp;
2030         } else {
2031                 rc = -ENODEV;
2032                 goto out_statfs;
2033         }
2034
2035         rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, NULL);
2036         if (rc)
2037                 goto out_statfs;
2038 out_statfs:
2039         if (buf)
2040                 obd_ioctl_freedata(buf, len);
2041         return rc;
2042 }
2043
2044 int ll_process_config(struct lustre_cfg *lcfg)
2045 {
2046         char *ptr;
2047         void *sb;
2048         struct lprocfs_static_vars lvars;
2049         unsigned long x;
2050         int rc = 0;
2051
2052         lprocfs_llite_init_vars(&lvars);
2053
2054         /* The instance name contains the sb: lustre-client-aacfe000 */
2055         ptr = strrchr(lustre_cfg_string(lcfg, 0), '-');
2056         if (!ptr || !*(++ptr))
2057                 return -EINVAL;
2058         rc = kstrtoul(ptr, 16, &x);
2059         if (rc != 0)
2060                 return -EINVAL;
2061         sb = (void *)x;
2062         /* This better be a real Lustre superblock! */
2063         LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
2064
2065         /* Note we have not called client_common_fill_super yet, so
2066          * proc fns must be able to handle that!
2067          */
2068         rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
2069                                       lcfg, sb);
2070         if (rc > 0)
2071                 rc = 0;
2072         return rc;
2073 }
2074
2075 /* this function prepares md_op_data hint for passing ot down to MD stack. */
2076 struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
2077                                       struct inode *i1, struct inode *i2,
2078                                       const char *name, int namelen,
2079                                       int mode, __u32 opc, void *data)
2080 {
2081         if (namelen > ll_i2sbi(i1)->ll_namelen)
2082                 return ERR_PTR(-ENAMETOOLONG);
2083
2084         if (!op_data)
2085                 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
2086
2087         if (!op_data)
2088                 return ERR_PTR(-ENOMEM);
2089
2090         ll_i2gids(op_data->op_suppgids, i1, i2);
2091         op_data->op_fid1 = *ll_inode2fid(i1);
2092
2093         if (i2)
2094                 op_data->op_fid2 = *ll_inode2fid(i2);
2095         else
2096                 fid_zero(&op_data->op_fid2);
2097
2098         op_data->op_name = name;
2099         op_data->op_namelen = namelen;
2100         op_data->op_mode = mode;
2101         op_data->op_mod_time = ktime_get_real_seconds();
2102         op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
2103         op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
2104         op_data->op_cap = cfs_curproc_cap_pack();
2105         op_data->op_bias = 0;
2106         op_data->op_cli_flags = 0;
2107         if ((opc == LUSTRE_OPC_CREATE) && name &&
2108             filename_is_volatile(name, namelen, NULL))
2109                 op_data->op_bias |= MDS_CREATE_VOLATILE;
2110         op_data->op_opc = opc;
2111         op_data->op_mds = 0;
2112         op_data->op_data = data;
2113
2114         /* If the file is being opened after mknod() (normally due to NFS)
2115          * try to use the default stripe data from parent directory for
2116          * allocating OST objects.  Try to pass the parent FID to MDS.
2117          */
2118         if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) &&
2119             !ll_i2info(i2)->lli_has_smd) {
2120                 struct ll_inode_info *lli = ll_i2info(i2);
2121
2122                 spin_lock(&lli->lli_lock);
2123                 if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid)))
2124                         op_data->op_fid1 = lli->lli_pfid;
2125                 spin_unlock(&lli->lli_lock);
2126         }
2127
2128         /* When called by ll_setattr_raw, file is i1. */
2129         if (ll_i2info(i1)->lli_flags & LLIF_DATA_MODIFIED)
2130                 op_data->op_bias |= MDS_DATA_MODIFIED;
2131
2132         return op_data;
2133 }
2134
/* Release an md_op_data structure; passing NULL is harmless (kfree). */
void ll_finish_md_op_data(struct md_op_data *op_data)
{
        kfree(op_data);
}
2139
2140 int ll_show_options(struct seq_file *seq, struct dentry *dentry)
2141 {
2142         struct ll_sb_info *sbi;
2143
2144         LASSERT(seq && dentry);
2145         sbi = ll_s2sbi(dentry->d_sb);
2146
2147         if (sbi->ll_flags & LL_SBI_NOLCK)
2148                 seq_puts(seq, ",nolock");
2149
2150         if (sbi->ll_flags & LL_SBI_FLOCK)
2151                 seq_puts(seq, ",flock");
2152
2153         if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
2154                 seq_puts(seq, ",localflock");
2155
2156         if (sbi->ll_flags & LL_SBI_USER_XATTR)
2157                 seq_puts(seq, ",user_xattr");
2158
2159         if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
2160                 seq_puts(seq, ",lazystatfs");
2161
2162         if (sbi->ll_flags & LL_SBI_USER_FID2PATH)
2163                 seq_puts(seq, ",user_fid2path");
2164
2165         return 0;
2166 }
2167
2168 /**
2169  * Get obd name by cmd, and copy out to user space
2170  */
2171 int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg)
2172 {
2173         struct ll_sb_info *sbi = ll_i2sbi(inode);
2174         struct obd_device *obd;
2175
2176         if (cmd == OBD_IOC_GETDTNAME)
2177                 obd = class_exp2obd(sbi->ll_dt_exp);
2178         else if (cmd == OBD_IOC_GETMDNAME)
2179                 obd = class_exp2obd(sbi->ll_md_exp);
2180         else
2181                 return -EINVAL;
2182
2183         if (!obd)
2184                 return -ENOENT;
2185
2186         if (copy_to_user((void __user *)arg, obd->obd_name,
2187                          strlen(obd->obd_name) + 1))
2188                 return -EFAULT;
2189
2190         return 0;
2191 }
2192
2193 /**
2194  * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the
2195  * fsname will be returned in this buffer; otherwise, a static buffer will be
2196  * used to store the fsname and returned to caller.
2197  */
2198 char *ll_get_fsname(struct super_block *sb, char *buf, int buflen)
2199 {
2200         static char fsname_static[MTI_NAME_MAXLEN];
2201         struct lustre_sb_info *lsi = s2lsi(sb);
2202         char *ptr;
2203         int len;
2204
2205         if (!buf) {
2206                 /* this means the caller wants to use static buffer
2207                  * and it doesn't care about race. Usually this is
2208                  * in error reporting path
2209                  */
2210                 buf = fsname_static;
2211                 buflen = sizeof(fsname_static);
2212         }
2213
2214         len = strlen(lsi->lsi_lmd->lmd_profile);
2215         ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
2216         if (ptr && (strcmp(ptr, "-client") == 0))
2217                 len -= 7;
2218
2219         if (unlikely(len >= buflen))
2220                 len = buflen - 1;
2221         strncpy(buf, lsi->lsi_lmd->lmd_profile, len);
2222         buf[len] = '\0';
2223
2224         return buf;
2225 }
2226
2227 void ll_dirty_page_discard_warn(struct page *page, int ioret)
2228 {
2229         char *buf, *path = NULL;
2230         struct dentry *dentry = NULL;
2231         struct vvp_object *obj = cl_inode2vvp(page->mapping->host);
2232
2233         /* this can be called inside spin lock so use GFP_ATOMIC. */
2234         buf = (char *)__get_free_page(GFP_ATOMIC);
2235         if (buf) {
2236                 dentry = d_find_alias(page->mapping->host);
2237                 if (dentry)
2238                         path = dentry_path_raw(dentry, buf, PAGE_SIZE);
2239         }
2240
2241         CDEBUG(D_WARNING,
2242                "%s: dirty page discard: %s/fid: " DFID "/%s may get corrupted (rc %d)\n",
2243                ll_get_fsname(page->mapping->host->i_sb, NULL, 0),
2244                s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev,
2245                PFID(&obj->vob_header.coh_lu.loh_fid),
2246                (path && !IS_ERR(path)) ? path : "", ioret);
2247
2248         if (dentry)
2249                 dput(dentry);
2250
2251         if (buf)
2252                 free_page((unsigned long)buf);
2253 }