fs/ceph/super.c

   1
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/backing-dev.h>
   5 #include <linux/ctype.h>
   6 #include <linux/fs.h>
   7 #include <linux/inet.h>
   8 #include <linux/in6.h>
   9 #include <linux/module.h>
  10 #include <linux/mount.h>
  11 #include <linux/parser.h>
  12 #include <linux/sched.h>
  13 #include <linux/seq_file.h>
  14 #include <linux/slab.h>
  15 #include <linux/statfs.h>
  16 #include <linux/string.h>
  17
  18 #include "super.h"
  19 #include "mds_client.h"
  20 #include "cache.h"
  21
  22 #include <linux/ceph/ceph_features.h>
  23 #include <linux/ceph/decode.h>
  24 #include <linux/ceph/mon_client.h>
  25 #include <linux/ceph/auth.h>
  26 #include <linux/ceph/debugfs.h>
  27
  28 /*
  29  * Ceph superblock operations
  30  *
  31  * Handle the basics of mounting, unmounting.
  32  */
  33
  34 /*
  35  * super ops
  36  */
  37 static void ceph_put_super(struct super_block *s)
  38 {
  39         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
  40
  41         dout("put_super\n");
  42         ceph_mdsc_close_sessions(fsc->mdsc);
  43 }
  44
  45 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  46 {
  47         struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
  48         struct ceph_monmap *monmap = fsc->client->monc.monmap;
  49         struct ceph_statfs st;
  50         u64 fsid;
  51         int err;
  52
  53         dout("statfs\n");
  54         err = ceph_monc_do_statfs(&fsc->client->monc, &st);
  55         if (err < 0)
  56                 return err;
  57
  58         /* fill in kstatfs */
  59         buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
  60
  61         /*
  62          * express utilization in terms of large blocks to avoid
  63          * overflow on 32-bit machines.
  64          *
  65          * NOTE: for the time being, we make bsize == frsize to humor
  66          * not-yet-ancient versions of glibc that are broken.
  67          * Someday, we will probably want to report a real block
  68          * size...  whatever that may mean for a network file system!
  69          */
  70         buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  71         buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  72         buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
  73         buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  74         buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  75
  76         buf->f_files = le64_to_cpu(st.num_objects);
  77         buf->f_ffree = -1;
  78         buf->f_namelen = NAME_MAX;
  79
  80         /* leave fsid little-endian, regardless of host endianness */
  81         fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
  82         buf->f_fsid.val[0] = fsid & 0xffffffff;
  83         buf->f_fsid.val[1] = fsid >> 32;
  84
  85         return 0;
  86 }
  87
  88
  89 static int ceph_sync_fs(struct super_block *sb, int wait)
  90 {
  91         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
  92
  93         if (!wait) {
  94                 dout("sync_fs (non-blocking)\n");
  95                 ceph_flush_dirty_caps(fsc->mdsc);
  96                 dout("sync_fs (non-blocking) done\n");
  97                 return 0;
  98         }
  99
 100         dout("sync_fs (blocking)\n");
 101         ceph_osdc_sync(&fsc->client->osdc);
 102         ceph_mdsc_sync(fsc->mdsc);
 103         dout("sync_fs (blocking) done\n");
 104         return 0;
 105 }
 106
 107 /*
 108  * mount options
 109  */
/*
 * Tokens returned by match_token() for the filesystem-level mount options.
 *
 * The ORDER is significant: parse_fsopt_token() classifies a token purely
 * by its position relative to the two markers below.  Tokens before
 * Opt_last_int carry an integer argument, tokens between Opt_last_int and
 * Opt_last_string carry a string argument, and everything after
 * Opt_last_string is a bare flag.  Add new options to the matching group.
 */
enum {
        Opt_wsize,
        Opt_rsize,
        Opt_rasize,
        Opt_caps_wanted_delay_min,
        Opt_caps_wanted_delay_max,
        Opt_cap_release_safety,
        Opt_readdir_max_entries,
        Opt_readdir_max_bytes,
        Opt_congestion_kb,
        Opt_last_int,
        /* int args above */
        Opt_snapdirname,
        Opt_last_string,
        /* string args above */
        Opt_dirstat,
        Opt_nodirstat,
        Opt_rbytes,
        Opt_norbytes,
        Opt_asyncreaddir,
        Opt_noasyncreaddir,
        Opt_dcache,
        Opt_nodcache,
        Opt_ino32,
        Opt_noino32,
        Opt_fscache,
        Opt_nofscache,
        Opt_poolperm,
        Opt_nopoolperm,
        /* "acl" is only recognised when POSIX ACL support is built in */
#ifdef CONFIG_CEPH_FS_POSIX_ACL
        Opt_acl,
#endif
        /* "noacl" is always recognised, whatever the configuration */
        Opt_noacl,
};
 144
 145 static match_table_t fsopt_tokens = {
 146         {Opt_wsize, "wsize=%d"},
 147         {Opt_rsize, "rsize=%d"},
 148         {Opt_rasize, "rasize=%d"},
 149         {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 150         {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 151         {Opt_cap_release_safety, "cap_release_safety=%d"},
 152         {Opt_readdir_max_entries, "readdir_max_entries=%d"},
 153         {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 154         {Opt_congestion_kb, "write_congestion_kb=%d"},
 155         /* int args above */
 156         {Opt_snapdirname, "snapdirname=%s"},
 157         /* string args above */
 158         {Opt_dirstat, "dirstat"},
 159         {Opt_nodirstat, "nodirstat"},
 160         {Opt_rbytes, "rbytes"},
 161         {Opt_norbytes, "norbytes"},
 162         {Opt_asyncreaddir, "asyncreaddir"},
 163         {Opt_noasyncreaddir, "noasyncreaddir"},
 164         {Opt_dcache, "dcache"},
 165         {Opt_nodcache, "nodcache"},
 166         {Opt_ino32, "ino32"},
 167         {Opt_noino32, "noino32"},
 168         {Opt_fscache, "fsc"},
 169         {Opt_nofscache, "nofsc"},
 170         {Opt_poolperm, "poolperm"},
 171         {Opt_nopoolperm, "nopoolperm"},
 172 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 173         {Opt_acl, "acl"},
 174 #endif
 175         {Opt_noacl, "noacl"},
 176         {-1, NULL}
 177 };
 178
 179 static int parse_fsopt_token(char *c, void *private)
 180 {
 181         struct ceph_mount_options *fsopt = private;
 182         substring_t argstr[MAX_OPT_ARGS];
 183         int token, intval, ret;
 184
 185         token = match_token((char *)c, fsopt_tokens, argstr);
 186         if (token < 0)
 187                 return -EINVAL;
 188
 189         if (token < Opt_last_int) {
 190                 ret = match_int(&argstr[0], &intval);
 191                 if (ret < 0) {
 192                         pr_err("bad mount option arg (not int) "
 193                                "at '%s'\n", c);
 194                         return ret;
 195                 }
 196                 dout("got int token %d val %d\n", token, intval);
 197         } else if (token > Opt_last_int && token < Opt_last_string) {
 198                 dout("got string token %d val %s\n", token,
 199                      argstr[0].from);
 200         } else {
 201                 dout("got token %d\n", token);
 202         }
 203
 204         switch (token) {
 205         case Opt_snapdirname:
 206                 kfree(fsopt->snapdir_name);
 207                 fsopt->snapdir_name = kstrndup(argstr[0].from,
 208                                                argstr[0].to-argstr[0].from,
 209                                                GFP_KERNEL);
 210                 if (!fsopt->snapdir_name)
 211                         return -ENOMEM;
 212                 break;
 213
 214                 /* misc */
 215         case Opt_wsize:
 216                 fsopt->wsize = intval;
 217                 break;
 218         case Opt_rsize:
 219                 fsopt->rsize = intval;
 220                 break;
 221         case Opt_rasize:
 222                 fsopt->rasize = intval;
 223                 break;
 224         case Opt_caps_wanted_delay_min:
 225                 fsopt->caps_wanted_delay_min = intval;
 226                 break;
 227         case Opt_caps_wanted_delay_max:
 228                 fsopt->caps_wanted_delay_max = intval;
 229                 break;
 230         case Opt_readdir_max_entries:
 231                 fsopt->max_readdir = intval;
 232                 break;
 233         case Opt_readdir_max_bytes:
 234                 fsopt->max_readdir_bytes = intval;
 235                 break;
 236         case Opt_congestion_kb:
 237                 fsopt->congestion_kb = intval;
 238                 break;
 239         case Opt_dirstat:
 240                 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
 241                 break;
 242         case Opt_nodirstat:
 243                 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
 244                 break;
 245         case Opt_rbytes:
 246                 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
 247                 break;
 248         case Opt_norbytes:
 249                 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
 250                 break;
 251         case Opt_asyncreaddir:
 252                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
 253                 break;
 254         case Opt_noasyncreaddir:
 255                 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 256                 break;
 257         case Opt_dcache:
 258                 fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
 259                 break;
 260         case Opt_nodcache:
 261                 fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
 262                 break;
 263         case Opt_ino32:
 264                 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
 265                 break;
 266         case Opt_noino32:
 267                 fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
 268                 break;
 269         case Opt_fscache:
 270                 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
 271                 break;
 272         case Opt_nofscache:
 273                 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
 274                 break;
 275         case Opt_poolperm:
 276                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
 277                 printk ("pool perm");
 278                 break;
 279         case Opt_nopoolperm:
 280                 fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
 281                 break;
 282 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 283         case Opt_acl:
 284                 fsopt->sb_flags |= MS_POSIXACL;
 285                 break;
 286 #endif
 287         case Opt_noacl:
 288                 fsopt->sb_flags &= ~MS_POSIXACL;
 289                 break;
 290         default:
 291                 BUG_ON(token);
 292         }
 293         return 0;
 294 }
 295
 296 static void destroy_mount_options(struct ceph_mount_options *args)
 297 {
 298         dout("destroy_mount_options %p\n", args);
 299         kfree(args->snapdir_name);
 300         kfree(args);
 301 }
 302
 303 static int strcmp_null(const char *s1, const char *s2)
 304 {
 305         if (!s1 && !s2)
 306                 return 0;
 307         if (s1 && !s2)
 308                 return -1;
 309         if (!s1 && s2)
 310                 return 1;
 311         return strcmp(s1, s2);
 312 }
 313
 314 static int compare_mount_options(struct ceph_mount_options *new_fsopt,
 315                                  struct ceph_options *new_opt,
 316                                  struct ceph_fs_client *fsc)
 317 {
 318         struct ceph_mount_options *fsopt1 = new_fsopt;
 319         struct ceph_mount_options *fsopt2 = fsc->mount_options;
 320         int ofs = offsetof(struct ceph_mount_options, snapdir_name);
 321         int ret;
 322
 323         ret = memcmp(fsopt1, fsopt2, ofs);
 324         if (ret)
 325                 return ret;
 326
 327         ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
 328         if (ret)
 329                 return ret;
 330
 331         return ceph_compare_options(new_opt, fsc->client);
 332 }
 333
 334 static int parse_mount_options(struct ceph_mount_options **pfsopt,
 335                                struct ceph_options **popt,
 336                                int flags, char *options,
 337                                const char *dev_name,
 338                                const char **path)
 339 {
 340         struct ceph_mount_options *fsopt;
 341         const char *dev_name_end;
 342         int err;
 343
 344         if (!dev_name || !*dev_name)
 345                 return -EINVAL;
 346
 347         fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
 348         if (!fsopt)
 349                 return -ENOMEM;
 350
 351         dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
 352
 353         fsopt->sb_flags = flags;
 354         fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
 355
 356         fsopt->rsize = CEPH_RSIZE_DEFAULT;
 357         fsopt->rasize = CEPH_RASIZE_DEFAULT;
 358         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 359         if (!fsopt->snapdir_name) {
 360                 err = -ENOMEM;
 361                 goto out;
 362         }
 363
 364         fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 365         fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 366         fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
 367         fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
 368         fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 369         fsopt->congestion_kb = default_congestion_kb();
 370
 371         /*
 372          * Distinguish the server list from the path in "dev_name".
 373          * Internally we do not include the leading '/' in the path.
 374          *
 375          * "dev_name" will look like:
 376          *     <server_spec>[,<server_spec>...]:[<path>]
 377          * where
 378          *     <server_spec> is <ip>[:<port>]
 379          *     <path> is optional, but if present must begin with '/'
 380          */
 381         dev_name_end = strchr(dev_name, '/');
 382         if (dev_name_end) {
 383                 /* skip over leading '/' for path */
 384                 *path = dev_name_end + 1;
 385         } else {
 386                 /* path is empty */
 387                 dev_name_end = dev_name + strlen(dev_name);
 388                 *path = dev_name_end;
 389         }
 390         err = -EINVAL;
 391         dev_name_end--;         /* back up to ':' separator */
 392         if (dev_name_end < dev_name || *dev_name_end != ':') {
 393                 pr_err("device name is missing path (no : separator in %s)\n",
 394                                 dev_name);
 395                 goto out;
 396         }
 397         dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 398         dout("server path '%s'\n", *path);
 399
 400         *popt = ceph_parse_options(options, dev_name, dev_name_end,
 401                                  parse_fsopt_token, (void *)fsopt);
 402         if (IS_ERR(*popt)) {
 403                 err = PTR_ERR(*popt);
 404                 goto out;
 405         }
 406
 407         /* success */
 408         *pfsopt = fsopt;
 409         return 0;
 410
 411 out:
 412         destroy_mount_options(fsopt);
 413         return err;
 414 }
 415
 416 /**
 417  * ceph_show_options - Show mount options in /proc/mounts
 418  * @m: seq_file to write to
 419  * @root: root of that (sub)tree
 420  */
 421 static int ceph_show_options(struct seq_file *m, struct dentry *root)
 422 {
 423         struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
 424         struct ceph_mount_options *fsopt = fsc->mount_options;
 425         size_t pos;
 426         int ret;
 427
 428         /* a comma between MNT/MS and client options */
 429         seq_putc(m, ',');
 430         pos = m->count;
 431
 432         ret = ceph_print_client_options(m, fsc->client);
 433         if (ret)
 434                 return ret;
 435
 436         /* retract our comma if no client options */
 437         if (m->count == pos)
 438                 m->count--;
 439
 440         if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
 441                 seq_puts(m, ",dirstat");
 442         if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
 443                 seq_puts(m, ",rbytes");
 444         if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
 445                 seq_puts(m, ",noasyncreaddir");
 446         if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
 447                 seq_puts(m, ",nodcache");
 448         if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
 449                 seq_puts(m, ",fsc");
 450         if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
 451                 seq_puts(m, ",nopoolperm");
 452
 453 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 454         if (fsopt->sb_flags & MS_POSIXACL)
 455                 seq_puts(m, ",acl");
 456         else
 457                 seq_puts(m, ",noacl");
 458 #endif
 459
 460         if (fsopt->wsize)
 461                 seq_printf(m, ",wsize=%d", fsopt->wsize);
 462         if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
 463                 seq_printf(m, ",rsize=%d", fsopt->rsize);
 464         if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
 465                 seq_printf(m, ",rasize=%d", fsopt->rasize);
 466         if (fsopt->congestion_kb != default_congestion_kb())
 467                 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
 468         if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
 469                 seq_printf(m, ",caps_wanted_delay_min=%d",
 470                          fsopt->caps_wanted_delay_min);
 471         if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
 472                 seq_printf(m, ",caps_wanted_delay_max=%d",
 473                            fsopt->caps_wanted_delay_max);
 474         if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
 475                 seq_printf(m, ",cap_release_safety=%d",
 476                            fsopt->cap_release_safety);
 477         if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
 478                 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
 479         if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
 480                 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
 481         if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 482                 seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 483
 484         return 0;
 485 }
 486
 487 /*
 488  * handle any mon messages the standard library doesn't understand.
 489  * return error if we don't either.
 490  */
 491 static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 492 {
 493         struct ceph_fs_client *fsc = client->private;
 494         int type = le16_to_cpu(msg->hdr.type);
 495
 496         switch (type) {
 497         case CEPH_MSG_MDS_MAP:
 498                 ceph_mdsc_handle_map(fsc->mdsc, msg);
 499                 return 0;
 500
 501         default:
 502                 return -1;
 503         }
 504 }
 505
 506 /*
 507  * create a new fs client
 508  */
 509 static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 510                                         struct ceph_options *opt)
 511 {
 512         struct ceph_fs_client *fsc;
 513         const u64 supported_features =
 514                 CEPH_FEATURE_FLOCK |
 515                 CEPH_FEATURE_DIRLAYOUTHASH |
 516                 CEPH_FEATURE_MDS_INLINE_DATA;
 517         const u64 required_features = 0;
 518         int page_count;
 519         size_t size;
 520         int err = -ENOMEM;
 521
 522         fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
 523         if (!fsc)
 524                 return ERR_PTR(-ENOMEM);
 525
 526         fsc->client = ceph_create_client(opt, fsc, supported_features,
 527                                          required_features);
 528         if (IS_ERR(fsc->client)) {
 529                 err = PTR_ERR(fsc->client);
 530                 goto fail;
 531         }
 532         fsc->client->extra_mon_dispatch = extra_mon_dispatch;
 533         ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
 534
 535         fsc->mount_options = fsopt;
 536
 537         fsc->sb = NULL;
 538         fsc->mount_state = CEPH_MOUNT_MOUNTING;
 539
 540         atomic_long_set(&fsc->writeback_count, 0);
 541
 542         err = bdi_init(&fsc->backing_dev_info);
 543         if (err < 0)
 544                 goto fail_client;
 545
 546         err = -ENOMEM;
 547         /*
 548          * The number of concurrent works can be high but they don't need
 549          * to be processed in parallel, limit concurrency.
 550          */
 551         fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
 552         if (fsc->wb_wq == NULL)
 553                 goto fail_bdi;
 554         fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
 555         if (fsc->pg_inv_wq == NULL)
 556                 goto fail_wb_wq;
 557         fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
 558         if (fsc->trunc_wq == NULL)
 559                 goto fail_pg_inv_wq;
 560
 561         /* set up mempools */
 562         err = -ENOMEM;
 563         page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
 564         size = sizeof (struct page *) * (page_count ? page_count : 1);
 565         fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
 566         if (!fsc->wb_pagevec_pool)
 567                 goto fail_trunc_wq;
 568
 569         /* setup fscache */
 570         if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
 571             (ceph_fscache_register_fs(fsc) != 0))
 572                 goto fail_fscache;
 573
 574         /* caps */
 575         fsc->min_caps = fsopt->max_readdir;
 576
 577         return fsc;
 578
 579 fail_fscache:
 580         ceph_fscache_unregister_fs(fsc);
 581 fail_trunc_wq:
 582         destroy_workqueue(fsc->trunc_wq);
 583 fail_pg_inv_wq:
 584         destroy_workqueue(fsc->pg_inv_wq);
 585 fail_wb_wq:
 586         destroy_workqueue(fsc->wb_wq);
 587 fail_bdi:
 588         bdi_destroy(&fsc->backing_dev_info);
 589 fail_client:
 590         ceph_destroy_client(fsc->client);
 591 fail:
 592         kfree(fsc);
 593         return ERR_PTR(err);
 594 }
 595
/*
 * Tear down a filesystem client built by create_fs_client(): release the
 * fscache registration, the three workqueues, the bdi, the pagevec
 * mempool, the mount options, the debugfs entries and the libceph client,
 * then free @fsc itself.
 */
static void destroy_fs_client(struct ceph_fs_client *fsc)
{
        dout("destroy_fs_client %p\n", fsc);

        ceph_fscache_unregister_fs(fsc);

        destroy_workqueue(fsc->wb_wq);
        destroy_workqueue(fsc->pg_inv_wq);
        destroy_workqueue(fsc->trunc_wq);

        bdi_destroy(&fsc->backing_dev_info);

        mempool_destroy(fsc->wb_pagevec_pool);

        /* frees the options structure and the snapdir name it owns */
        destroy_mount_options(fsc->mount_options);

        ceph_fs_debugfs_cleanup(fsc);

        ceph_destroy_client(fsc->client);

        kfree(fsc);
        /* only the pointer value is printed; fsc is not dereferenced */
        dout("destroy_fs_client %p done\n", fsc);
}
 619
 620 /*
 621  * caches
 622  */
/*
 * Slab caches for the ceph structures named in each identifier.  They are
 * created by init_caches() and destroyed by destroy_caches().  They have
 * external linkage, presumably because other ceph source files allocate
 * from them -- confirm against the users.
 */
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
 628
 629 static void ceph_inode_init_once(void *foo)
 630 {
 631         struct ceph_inode_info *ci = foo;
 632         inode_init_once(&ci->vfs_inode);
 633 }
 634
 635 static int __init init_caches(void)
 636 {
 637         int error = -ENOMEM;
 638
 639         ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 640                                       sizeof(struct ceph_inode_info),
 641                                       __alignof__(struct ceph_inode_info),
 642                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
 643                                       SLAB_ACCOUNT, ceph_inode_init_once);
 644         if (ceph_inode_cachep == NULL)
 645                 return -ENOMEM;
 646
 647         ceph_cap_cachep = KMEM_CACHE(ceph_cap,
 648                                      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 649         if (ceph_cap_cachep == NULL)
 650                 goto bad_cap;
 651         ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
 652                                            SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 653         if (ceph_cap_flush_cachep == NULL)
 654                 goto bad_cap_flush;
 655
 656         ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
 657                                         SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 658         if (ceph_dentry_cachep == NULL)
 659                 goto bad_dentry;
 660
 661         ceph_file_cachep = KMEM_CACHE(ceph_file_info,
 662                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 663         if (ceph_file_cachep == NULL)
 664                 goto bad_file;
 665
 666         if ((error = ceph_fscache_register()))
 667                 goto bad_file;
 668
 669         return 0;
 670 bad_file:
 671         kmem_cache_destroy(ceph_dentry_cachep);
 672 bad_dentry:
 673         kmem_cache_destroy(ceph_cap_flush_cachep);
 674 bad_cap_flush:
 675         kmem_cache_destroy(ceph_cap_cachep);
 676 bad_cap:
 677         kmem_cache_destroy(ceph_inode_cachep);
 678         return error;
 679 }
 680
 681 static void destroy_caches(void)
 682 {
 683         /*
 684          * Make sure all delayed rcu free inodes are flushed before we
 685          * destroy cache.
 686          */
 687         rcu_barrier();
 688
 689         kmem_cache_destroy(ceph_inode_cachep);
 690         kmem_cache_destroy(ceph_cap_cachep);
 691         kmem_cache_destroy(ceph_cap_flush_cachep);
 692         kmem_cache_destroy(ceph_dentry_cachep);
 693         kmem_cache_destroy(ceph_file_cachep);
 694
 695         ceph_fscache_unregister();
 696 }
 697
 698
 699 /*
 700  * ceph_umount_begin - initiate forced umount.  Tear down down the
 701  * mount, skipping steps that may hang while waiting for server(s).
 702  */
 703 static void ceph_umount_begin(struct super_block *sb)
 704 {
 705         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 706
 707         dout("ceph_umount_begin - starting forced umount\n");
 708         if (!fsc)
 709                 return;
 710         fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
 711         ceph_mdsc_force_umount(fsc->mdsc);
 712         return;
 713 }
 714
/*
 * Superblock operations installed on each ceph super_block by
 * ceph_set_super().  The inode callbacks (alloc/destroy/write/drop) are
 * not defined in this part of the file; the remaining handlers are the
 * static functions above.
 */
static const struct super_operations ceph_super_ops = {
        .alloc_inode    = ceph_alloc_inode,
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
        .drop_inode     = ceph_drop_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
        .statfs         = ceph_statfs,
        .umount_begin   = ceph_umount_begin,
};
 726
 727 /*
 728  * Bootstrap mount by opening the root directory.  Note the mount
 729  * @started time from caller, and time out if this takes too long.
 730  */
 731 static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 732                                        const char *path,
 733                                        unsigned long started)
 734 {
 735         struct ceph_mds_client *mdsc = fsc->mdsc;
 736         struct ceph_mds_request *req = NULL;
 737         int err;
 738         struct dentry *root;
 739
 740         /* open dir */
 741         dout("open_root_inode opening '%s'\n", path);
 742         req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
 743         if (IS_ERR(req))
 744                 return ERR_CAST(req);
 745         req->r_path1 = kstrdup(path, GFP_NOFS);
 746         if (!req->r_path1) {
 747                 root = ERR_PTR(-ENOMEM);
 748                 goto out;
 749         }
 750
 751         req->r_ino1.ino = CEPH_INO_ROOT;
 752         req->r_ino1.snap = CEPH_NOSNAP;
 753         req->r_started = started;
 754         req->r_timeout = fsc->client->options->mount_timeout;
 755         req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 756         req->r_num_caps = 2;
 757         err = ceph_mdsc_do_request(mdsc, NULL, req);
 758         if (err == 0) {
 759                 struct inode *inode = req->r_target_inode;
 760                 req->r_target_inode = NULL;
 761                 dout("open_root_inode success\n");
 762                 if (ceph_ino(inode) == CEPH_INO_ROOT &&
 763                     fsc->sb->s_root == NULL) {
 764                         root = d_make_root(inode);
 765                         if (!root) {
 766                                 root = ERR_PTR(-ENOMEM);
 767                                 goto out;
 768                         }
 769                 } else {
 770                         root = d_obtain_root(inode);
 771                 }
 772                 ceph_init_dentry(root);
 773                 dout("open_root_inode success, root dentry is %p\n", root);
 774         } else {
 775                 root = ERR_PTR(err);
 776         }
 777 out:
 778         ceph_mdsc_put_request(req);
 779         return root;
 780 }
 781
 782
 783
 784
 785 /*
 786  * mount: join the ceph cluster, and open root directory.
 787  */
 788 static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 789                       const char *path)
 790 {
 791         int err;
 792         unsigned long started = jiffies;  /* note the start time */
 793         struct dentry *root;
 794         int first = 0;   /* first vfsmount for this super_block */
 795
 796         dout("mount start %p\n", fsc);
 797         mutex_lock(&fsc->client->mount_mutex);
 798
 799         if (!fsc->sb->s_root) {
 800                 err = __ceph_open_session(fsc->client, started);
 801                 if (err < 0)
 802                         goto out;
 803
 804                 dout("mount opening root\n");
 805                 root = open_root_dentry(fsc, "", started);
 806                 if (IS_ERR(root)) {
 807                         err = PTR_ERR(root);
 808                         goto out;
 809                 }
 810                 fsc->sb->s_root = root;
 811                 first = 1;
 812
 813                 err = ceph_fs_debugfs_init(fsc);
 814                 if (err < 0)
 815                         goto fail;
 816         }
 817
 818         if (path[0] == 0) {
 819                 root = fsc->sb->s_root;
 820                 dget(root);
 821         } else {
 822                 dout("mount opening base mountpoint\n");
 823                 root = open_root_dentry(fsc, path, started);
 824                 if (IS_ERR(root)) {
 825                         err = PTR_ERR(root);
 826                         goto fail;
 827                 }
 828         }
 829
 830         fsc->mount_state = CEPH_MOUNT_MOUNTED;
 831         dout("mount success\n");
 832         mutex_unlock(&fsc->client->mount_mutex);
 833         return root;
 834
 835 fail:
 836         if (first) {
 837                 dput(fsc->sb->s_root);
 838                 fsc->sb->s_root = NULL;
 839         }
 840 out:
 841         mutex_unlock(&fsc->client->mount_mutex);
 842         return ERR_PTR(err);
 843 }
 844
 845 static int ceph_set_super(struct super_block *s, void *data)
 846 {
 847         struct ceph_fs_client *fsc = data;
 848         int ret;
 849
 850         dout("set_super %p data %p\n", s, data);
 851
 852         s->s_flags = fsc->mount_options->sb_flags;
 853         s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 854
 855         s->s_xattr = ceph_xattr_handlers;
 856         s->s_fs_info = fsc;
 857         fsc->sb = s;
 858
 859         s->s_op = &ceph_super_ops;
 860         s->s_export_op = &ceph_export_ops;
 861
 862         s->s_time_gran = 1000;  /* 1000 ns == 1 us */
 863
 864         ret = set_anon_super(s, NULL);  /* what is that second arg for? */
 865         if (ret != 0)
 866                 goto fail;
 867
 868         return ret;
 869
 870 fail:
 871         s->s_fs_info = NULL;
 872         fsc->sb = NULL;
 873         return ret;
 874 }
 875
 876 /*
 877  * share superblock if same fs AND options
 878  */
 879 static int ceph_compare_super(struct super_block *sb, void *data)
 880 {
 881         struct ceph_fs_client *new = data;
 882         struct ceph_mount_options *fsopt = new->mount_options;
 883         struct ceph_options *opt = new->client->options;
 884         struct ceph_fs_client *other = ceph_sb_to_client(sb);
 885
 886         dout("ceph_compare_super %p\n", sb);
 887
 888         if (compare_mount_options(fsopt, opt, other)) {
 889                 dout("monitor(s)/mount options don't match\n");
 890                 return 0;
 891         }
 892         if ((opt->flags & CEPH_OPT_FSID) &&
 893             ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
 894                 dout("fsid doesn't match\n");
 895                 return 0;
 896         }
 897         if (fsopt->sb_flags != other->mount_options->sb_flags) {
 898                 dout("flags differ\n");
 899                 return 0;
 900         }
 901         return 1;
 902 }
 903
 904 /*
 905  * construct our own bdi so we can control readahead, etc.
 906  */
 907 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 908
 909 static int ceph_register_bdi(struct super_block *sb,
 910                              struct ceph_fs_client *fsc)
 911 {
 912         int err;
 913
 914         /* set ra_pages based on rasize mount option? */
 915         if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
 916                 fsc->backing_dev_info.ra_pages =
 917                         (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
 918                         >> PAGE_SHIFT;
 919         else
 920                 fsc->backing_dev_info.ra_pages =
 921                         VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
 922
 923         err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
 924                            atomic_long_inc_return(&bdi_seq));
 925         if (!err)
 926                 sb->s_bdi = &fsc->backing_dev_info;
 927         return err;
 928 }
 929
 930 static struct dentry *ceph_mount(struct file_system_type *fs_type,
 931                        int flags, const char *dev_name, void *data)
 932 {
 933         struct super_block *sb;
 934         struct ceph_fs_client *fsc;
 935         struct dentry *res;
 936         int err;
 937         int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 938         const char *path = NULL;
 939         struct ceph_mount_options *fsopt = NULL;
 940         struct ceph_options *opt = NULL;
 941
 942         dout("ceph_mount\n");
 943
 944 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 945         flags |= MS_POSIXACL;
 946 #endif
 947         err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
 948         if (err < 0) {
 949                 res = ERR_PTR(err);
 950                 goto out_final;
 951         }
 952
 953         /* create client (which we may/may not use) */
 954         fsc = create_fs_client(fsopt, opt);
 955         if (IS_ERR(fsc)) {
 956                 res = ERR_CAST(fsc);
 957                 destroy_mount_options(fsopt);
 958                 ceph_destroy_options(opt);
 959                 goto out_final;
 960         }
 961
 962         err = ceph_mdsc_init(fsc);
 963         if (err < 0) {
 964                 res = ERR_PTR(err);
 965                 goto out;
 966         }
 967
 968         if (ceph_test_opt(fsc->client, NOSHARE))
 969                 compare_super = NULL;
 970         sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
 971         if (IS_ERR(sb)) {
 972                 res = ERR_CAST(sb);
 973                 goto out;
 974         }
 975
 976         if (ceph_sb_to_client(sb) != fsc) {
 977                 ceph_mdsc_destroy(fsc);
 978                 destroy_fs_client(fsc);
 979                 fsc = ceph_sb_to_client(sb);
 980                 dout("get_sb got existing client %p\n", fsc);
 981         } else {
 982                 dout("get_sb using new client %p\n", fsc);
 983                 err = ceph_register_bdi(sb, fsc);
 984                 if (err < 0) {
 985                         res = ERR_PTR(err);
 986                         goto out_splat;
 987                 }
 988         }
 989
 990         res = ceph_real_mount(fsc, path);
 991         if (IS_ERR(res))
 992                 goto out_splat;
 993         dout("root %p inode %p ino %llx.%llx\n", res,
 994              d_inode(res), ceph_vinop(d_inode(res)));
 995         return res;
 996
 997 out_splat:
 998         ceph_mdsc_close_sessions(fsc->mdsc);
 999         deactivate_locked_super(sb);
1000         goto out_final;
1001
1002 out:
1003         ceph_mdsc_destroy(fsc);
1004         destroy_fs_client(fsc);
1005 out_final:
1006         dout("ceph_mount fail %ld\n", PTR_ERR(res));
1007         return res;
1008 }
1009
1010 static void ceph_kill_sb(struct super_block *s)
1011 {
1012         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1013         dev_t dev = s->s_dev;
1014
1015         dout("kill_sb %p\n", s);
1016
1017         ceph_mdsc_pre_umount(fsc->mdsc);
1018         generic_shutdown_super(s);
1019         ceph_mdsc_destroy(fsc);
1020
1021         destroy_fs_client(fsc);
1022         free_anon_bdev(dev);
1023 }
1024
1025 static struct file_system_type ceph_fs_type = {
1026         .owner          = THIS_MODULE,
1027         .name           = "ceph",
1028         .mount          = ceph_mount,
1029         .kill_sb        = ceph_kill_sb,
1030         .fs_flags       = FS_RENAME_DOES_D_MOVE,
1031 };
1032 MODULE_ALIAS_FS("ceph");
1033
1034 static int __init init_ceph(void)
1035 {
1036         int ret = init_caches();
1037         if (ret)
1038                 goto out;
1039
1040         ceph_flock_init();
1041         ceph_xattr_init();
1042         ret = register_filesystem(&ceph_fs_type);
1043         if (ret)
1044                 goto out_xattr;
1045
1046         pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1047
1048         return 0;
1049
1050 out_xattr:
1051         ceph_xattr_exit();
1052         destroy_caches();
1053 out:
1054         return ret;
1055 }
1056
1057 static void __exit exit_ceph(void)
1058 {
1059         dout("exit_ceph\n");
1060         unregister_filesystem(&ceph_fs_type);
1061         ceph_xattr_exit();
1062         destroy_caches();
1063 }
1064
     /* Hook init_ceph()/exit_ceph() up as the module's init and exit functions. */
1065 module_init(init_ceph);
1066 module_exit(exit_ceph);
1067
     /* Module metadata: authors, description and license. */
1068 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1069 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1070 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1071 MODULE_DESCRIPTION("Ceph filesystem for Linux");
1072 MODULE_LICENSE("GPL");