kernel/bpf/syscall.c

   1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
   2  *
   3  * This program is free software; you can redistribute it and/or
   4  * modify it under the terms of version 2 of the GNU General Public
   5  * License as published by the Free Software Foundation.
   6  *
   7  * This program is distributed in the hope that it will be useful, but
   8  * WITHOUT ANY WARRANTY; without even the implied warranty of
   9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10  * General Public License for more details.
  11  */
  12 #include <linux/bpf.h>
  13 #include <linux/syscalls.h>
  14 #include <linux/slab.h>
  15 #include <linux/anon_inodes.h>
  16 #include <linux/file.h>
  17 #include <linux/license.h>
  18 #include <linux/filter.h>
  19
  20 static LIST_HEAD(bpf_map_types);
  21
  22 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
  23 {
  24         struct bpf_map_type_list *tl;
  25         struct bpf_map *map;
  26
  27         list_for_each_entry(tl, &bpf_map_types, list_node) {
  28                 if (tl->type == attr->map_type) {
  29                         map = tl->ops->map_alloc(attr);
  30                         if (IS_ERR(map))
  31                                 return map;
  32                         map->ops = tl->ops;
  33                         map->map_type = attr->map_type;
  34                         return map;
  35                 }
  36         }
  37         return ERR_PTR(-EINVAL);
  38 }
  39
  40 /* boot time registration of different map implementations */
  41 void bpf_register_map_type(struct bpf_map_type_list *tl)
  42 {
  43         list_add(&tl->list_node, &bpf_map_types);
  44 }
  45
  46 /* called from workqueue */
  47 static void bpf_map_free_deferred(struct work_struct *work)
  48 {
  49         struct bpf_map *map = container_of(work, struct bpf_map, work);
  50
  51         /* implementation dependent freeing */
  52         map->ops->map_free(map);
  53 }
  54
  55 /* decrement map refcnt and schedule it for freeing via workqueue
  56  * (unrelying map implementation ops->map_free() might sleep)
  57  */
  58 void bpf_map_put(struct bpf_map *map)
  59 {
  60         if (atomic_dec_and_test(&map->refcnt)) {
  61                 INIT_WORK(&map->work, bpf_map_free_deferred);
  62                 schedule_work(&map->work);
  63         }
  64 }
  65
  66 static int bpf_map_release(struct inode *inode, struct file *filp)
  67 {
  68         struct bpf_map *map = filp->private_data;
  69
  70         bpf_map_put(map);
  71         return 0;
  72 }
  73
  74 static const struct file_operations bpf_map_fops = {
  75         .release = bpf_map_release,
  76 };
  77
  78 /* helper macro to check that unused fields 'union bpf_attr' are zero */
  79 #define CHECK_ATTR(CMD) \
  80         memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
  81                    sizeof(attr->CMD##_LAST_FIELD), 0, \
  82                    sizeof(*attr) - \
  83                    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
  84                    sizeof(attr->CMD##_LAST_FIELD)) != NULL
  85
  86 #define BPF_MAP_CREATE_LAST_FIELD max_entries
  87 /* called via syscall */
  88 static int map_create(union bpf_attr *attr)
  89 {
  90         struct bpf_map *map;
  91         int err;
  92
  93         err = CHECK_ATTR(BPF_MAP_CREATE);
  94         if (err)
  95                 return -EINVAL;
  96
  97         /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
  98         map = find_and_alloc_map(attr);
  99         if (IS_ERR(map))
 100                 return PTR_ERR(map);
 101
 102         atomic_set(&map->refcnt, 1);
 103
 104         err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
 105
 106         if (err < 0)
 107                 /* failed to allocate fd */
 108                 goto free_map;
 109
 110         return err;
 111
 112 free_map:
 113         map->ops->map_free(map);
 114         return err;
 115 }
 116
 117 /* if error is returned, fd is released.
 118  * On success caller should complete fd access with matching fdput()
 119  */
 120 struct bpf_map *bpf_map_get(struct fd f)
 121 {
 122         struct bpf_map *map;
 123
 124         if (!f.file)
 125                 return ERR_PTR(-EBADF);
 126
 127         if (f.file->f_op != &bpf_map_fops) {
 128                 fdput(f);
 129                 return ERR_PTR(-EINVAL);
 130         }
 131
 132         map = f.file->private_data;
 133
 134         return map;
 135 }
 136
 137 /* helper to convert user pointers passed inside __aligned_u64 fields */
 138 static void __user *u64_to_ptr(__u64 val)
 139 {
 140         return (void __user *) (unsigned long) val;
 141 }
 142
 143 /* last field in 'union bpf_attr' used by this command */
 144 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
 145
 146 static int map_lookup_elem(union bpf_attr *attr)
 147 {
 148         void __user *ukey = u64_to_ptr(attr->key);
 149         void __user *uvalue = u64_to_ptr(attr->value);
 150         int ufd = attr->map_fd;
 151         struct fd f = fdget(ufd);
 152         struct bpf_map *map;
 153         void *key, *value, *ptr;
 154         int err;
 155
 156         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
 157                 return -EINVAL;
 158
 159         map = bpf_map_get(f);
 160         if (IS_ERR(map))
 161                 return PTR_ERR(map);
 162
 163         err = -ENOMEM;
 164         key = kmalloc(map->key_size, GFP_USER);
 165         if (!key)
 166                 goto err_put;
 167
 168         err = -EFAULT;
 169         if (copy_from_user(key, ukey, map->key_size) != 0)
 170                 goto free_key;
 171
 172         err = -ENOMEM;
 173         value = kmalloc(map->value_size, GFP_USER);
 174         if (!value)
 175                 goto free_key;
 176
 177         rcu_read_lock();
 178         ptr = map->ops->map_lookup_elem(map, key);
 179         if (ptr)
 180                 memcpy(value, ptr, map->value_size);
 181         rcu_read_unlock();
 182
 183         err = -ENOENT;
 184         if (!ptr)
 185                 goto free_value;
 186
 187         err = -EFAULT;
 188         if (copy_to_user(uvalue, value, map->value_size) != 0)
 189                 goto free_value;
 190
 191         err = 0;
 192
 193 free_value:
 194         kfree(value);
 195 free_key:
 196         kfree(key);
 197 err_put:
 198         fdput(f);
 199         return err;
 200 }
 201
 202 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 203
 204 static int map_update_elem(union bpf_attr *attr)
 205 {
 206         void __user *ukey = u64_to_ptr(attr->key);
 207         void __user *uvalue = u64_to_ptr(attr->value);
 208         int ufd = attr->map_fd;
 209         struct fd f = fdget(ufd);
 210         struct bpf_map *map;
 211         void *key, *value;
 212         int err;
 213
 214         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
 215                 return -EINVAL;
 216
 217         map = bpf_map_get(f);
 218         if (IS_ERR(map))
 219                 return PTR_ERR(map);
 220
 221         err = -ENOMEM;
 222         key = kmalloc(map->key_size, GFP_USER);
 223         if (!key)
 224                 goto err_put;
 225
 226         err = -EFAULT;
 227         if (copy_from_user(key, ukey, map->key_size) != 0)
 228                 goto free_key;
 229
 230         err = -ENOMEM;
 231         value = kmalloc(map->value_size, GFP_USER);
 232         if (!value)
 233                 goto free_key;
 234
 235         err = -EFAULT;
 236         if (copy_from_user(value, uvalue, map->value_size) != 0)
 237                 goto free_value;
 238
 239         /* eBPF program that use maps are running under rcu_read_lock(),
 240          * therefore all map accessors rely on this fact, so do the same here
 241          */
 242         rcu_read_lock();
 243         err = map->ops->map_update_elem(map, key, value, attr->flags);
 244         rcu_read_unlock();
 245
 246 free_value:
 247         kfree(value);
 248 free_key:
 249         kfree(key);
 250 err_put:
 251         fdput(f);
 252         return err;
 253 }
 254
 255 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
 256
 257 static int map_delete_elem(union bpf_attr *attr)
 258 {
 259         void __user *ukey = u64_to_ptr(attr->key);
 260         int ufd = attr->map_fd;
 261         struct fd f = fdget(ufd);
 262         struct bpf_map *map;
 263         void *key;
 264         int err;
 265
 266         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
 267                 return -EINVAL;
 268
 269         map = bpf_map_get(f);
 270         if (IS_ERR(map))
 271                 return PTR_ERR(map);
 272
 273         err = -ENOMEM;
 274         key = kmalloc(map->key_size, GFP_USER);
 275         if (!key)
 276                 goto err_put;
 277
 278         err = -EFAULT;
 279         if (copy_from_user(key, ukey, map->key_size) != 0)
 280                 goto free_key;
 281
 282         rcu_read_lock();
 283         err = map->ops->map_delete_elem(map, key);
 284         rcu_read_unlock();
 285
 286 free_key:
 287         kfree(key);
 288 err_put:
 289         fdput(f);
 290         return err;
 291 }
 292
 293 /* last field in 'union bpf_attr' used by this command */
 294 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
 295
 296 static int map_get_next_key(union bpf_attr *attr)
 297 {
 298         void __user *ukey = u64_to_ptr(attr->key);
 299         void __user *unext_key = u64_to_ptr(attr->next_key);
 300         int ufd = attr->map_fd;
 301         struct fd f = fdget(ufd);
 302         struct bpf_map *map;
 303         void *key, *next_key;
 304         int err;
 305
 306         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
 307                 return -EINVAL;
 308
 309         map = bpf_map_get(f);
 310         if (IS_ERR(map))
 311                 return PTR_ERR(map);
 312
 313         err = -ENOMEM;
 314         key = kmalloc(map->key_size, GFP_USER);
 315         if (!key)
 316                 goto err_put;
 317
 318         err = -EFAULT;
 319         if (copy_from_user(key, ukey, map->key_size) != 0)
 320                 goto free_key;
 321
 322         err = -ENOMEM;
 323         next_key = kmalloc(map->key_size, GFP_USER);
 324         if (!next_key)
 325                 goto free_key;
 326
 327         rcu_read_lock();
 328         err = map->ops->map_get_next_key(map, key, next_key);
 329         rcu_read_unlock();
 330         if (err)
 331                 goto free_next_key;
 332
 333         err = -EFAULT;
 334         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
 335                 goto free_next_key;
 336
 337         err = 0;
 338
 339 free_next_key:
 340         kfree(next_key);
 341 free_key:
 342         kfree(key);
 343 err_put:
 344         fdput(f);
 345         return err;
 346 }
 347
 348 static LIST_HEAD(bpf_prog_types);
 349
 350 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 351 {
 352         struct bpf_prog_type_list *tl;
 353
 354         list_for_each_entry(tl, &bpf_prog_types, list_node) {
 355                 if (tl->type == type) {
 356                         prog->aux->ops = tl->ops;
 357                         prog->type = type;
 358                         return 0;
 359                 }
 360         }
 361
 362         return -EINVAL;
 363 }
 364
 365 void bpf_register_prog_type(struct bpf_prog_type_list *tl)
 366 {
 367         list_add(&tl->list_node, &bpf_prog_types);
 368 }
 369
 370 /* fixup insn->imm field of bpf_call instructions:
 371  * if (insn->imm == BPF_FUNC_map_lookup_elem)
 372  *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 373  * else if (insn->imm == BPF_FUNC_map_update_elem)
 374  *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 375  * else ...
 376  *
 377  * this function is called after eBPF program passed verification
 378  */
 379 static void fixup_bpf_calls(struct bpf_prog *prog)
 380 {
 381         const struct bpf_func_proto *fn;
 382         int i;
 383
 384         for (i = 0; i < prog->len; i++) {
 385                 struct bpf_insn *insn = &prog->insnsi[i];
 386
 387                 if (insn->code == (BPF_JMP | BPF_CALL)) {
 388                         /* we reach here when program has bpf_call instructions
 389                          * and it passed bpf_check(), means that
 390                          * ops->get_func_proto must have been supplied, check it
 391                          */
 392                         BUG_ON(!prog->aux->ops->get_func_proto);
 393
 394                         fn = prog->aux->ops->get_func_proto(insn->imm);
 395                         /* all functions that have prototype and verifier allowed
 396                          * programs to call them, must be real in-kernel functions
 397                          */
 398                         BUG_ON(!fn->func);
 399                         insn->imm = fn->func - __bpf_call_base;
 400                 }
 401         }
 402 }
 403
 404 /* drop refcnt on maps used by eBPF program and free auxilary data */
 405 static void free_used_maps(struct bpf_prog_aux *aux)
 406 {
 407         int i;
 408
 409         for (i = 0; i < aux->used_map_cnt; i++)
 410                 bpf_map_put(aux->used_maps[i]);
 411
 412         kfree(aux->used_maps);
 413 }
 414
 415 void bpf_prog_put(struct bpf_prog *prog)
 416 {
 417         if (atomic_dec_and_test(&prog->aux->refcnt)) {
 418                 free_used_maps(prog->aux);
 419                 bpf_prog_free(prog);
 420         }
 421 }
 422 EXPORT_SYMBOL_GPL(bpf_prog_put);
 423
 424 static int bpf_prog_release(struct inode *inode, struct file *filp)
 425 {
 426         struct bpf_prog *prog = filp->private_data;
 427
 428         bpf_prog_put(prog);
 429         return 0;
 430 }
 431
 432 static const struct file_operations bpf_prog_fops = {
 433         .release = bpf_prog_release,
 434 };
 435
 436 static struct bpf_prog *get_prog(struct fd f)
 437 {
 438         struct bpf_prog *prog;
 439
 440         if (!f.file)
 441                 return ERR_PTR(-EBADF);
 442
 443         if (f.file->f_op != &bpf_prog_fops) {
 444                 fdput(f);
 445                 return ERR_PTR(-EINVAL);
 446         }
 447
 448         prog = f.file->private_data;
 449
 450         return prog;
 451 }
 452
 453 /* called by sockets/tracing/seccomp before attaching program to an event
 454  * pairs with bpf_prog_put()
 455  */
 456 struct bpf_prog *bpf_prog_get(u32 ufd)
 457 {
 458         struct fd f = fdget(ufd);
 459         struct bpf_prog *prog;
 460
 461         prog = get_prog(f);
 462
 463         if (IS_ERR(prog))
 464                 return prog;
 465
 466         atomic_inc(&prog->aux->refcnt);
 467         fdput(f);
 468         return prog;
 469 }
 470 EXPORT_SYMBOL_GPL(bpf_prog_get);
 471
 472 /* last field in 'union bpf_attr' used by this command */
 473 #define BPF_PROG_LOAD_LAST_FIELD log_buf
 474
 475 static int bpf_prog_load(union bpf_attr *attr)
 476 {
 477         enum bpf_prog_type type = attr->prog_type;
 478         struct bpf_prog *prog;
 479         int err;
 480         char license[128];
 481         bool is_gpl;
 482
 483         if (CHECK_ATTR(BPF_PROG_LOAD))
 484                 return -EINVAL;
 485
 486         /* copy eBPF program license from user space */
 487         if (strncpy_from_user(license, u64_to_ptr(attr->license),
 488                               sizeof(license) - 1) < 0)
 489                 return -EFAULT;
 490         license[sizeof(license) - 1] = 0;
 491
 492         /* eBPF programs must be GPL compatible to use GPL-ed functions */
 493         is_gpl = license_is_gpl_compatible(license);
 494
 495         if (attr->insn_cnt >= BPF_MAXINSNS)
 496                 return -EINVAL;
 497
 498         /* plain bpf_prog allocation */
 499         prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 500         if (!prog)
 501                 return -ENOMEM;
 502
 503         prog->len = attr->insn_cnt;
 504
 505         err = -EFAULT;
 506         if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
 507                            prog->len * sizeof(struct bpf_insn)) != 0)
 508                 goto free_prog;
 509
 510         prog->orig_prog = NULL;
 511         prog->jited = false;
 512
 513         atomic_set(&prog->aux->refcnt, 1);
 514         prog->gpl_compatible = is_gpl;
 515
 516         /* find program type: socket_filter vs tracing_filter */
 517         err = find_prog_type(type, prog);
 518         if (err < 0)
 519                 goto free_prog;
 520
 521         /* run eBPF verifier */
 522         err = bpf_check(&prog, attr);
 523         if (err < 0)
 524                 goto free_used_maps;
 525
 526         /* fixup BPF_CALL->imm field */
 527         fixup_bpf_calls(prog);
 528
 529         /* eBPF program is ready to be JITed */
 530         bpf_prog_select_runtime(prog);
 531
 532         err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
 533         if (err < 0)
 534                 /* failed to allocate fd */
 535                 goto free_used_maps;
 536
 537         return err;
 538
 539 free_used_maps:
 540         free_used_maps(prog->aux);
 541 free_prog:
 542         bpf_prog_free(prog);
 543         return err;
 544 }
 545
 546 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 547 {
 548         union bpf_attr attr = {};
 549         int err;
 550
 551         /* the syscall is limited to root temporarily. This restriction will be
 552          * lifted when security audit is clean. Note that eBPF+tracing must have
 553          * this restriction, since it may pass kernel data to user space
 554          */
 555         if (!capable(CAP_SYS_ADMIN))
 556                 return -EPERM;
 557
 558         if (!access_ok(VERIFY_READ, uattr, 1))
 559                 return -EFAULT;
 560
 561         if (size > PAGE_SIZE)   /* silly large */
 562                 return -E2BIG;
 563
 564         /* If we're handed a bigger struct than we know of,
 565          * ensure all the unknown bits are 0 - i.e. new
 566          * user-space does not rely on any kernel feature
 567          * extensions we dont know about yet.
 568          */
 569         if (size > sizeof(attr)) {
 570                 unsigned char __user *addr;
 571                 unsigned char __user *end;
 572                 unsigned char val;
 573
 574                 addr = (void __user *)uattr + sizeof(attr);
 575                 end  = (void __user *)uattr + size;
 576
 577                 for (; addr < end; addr++) {
 578                         err = get_user(val, addr);
 579                         if (err)
 580                                 return err;
 581                         if (val)
 582                                 return -E2BIG;
 583                 }
 584                 size = sizeof(attr);
 585         }
 586
 587         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
 588         if (copy_from_user(&attr, uattr, size) != 0)
 589                 return -EFAULT;
 590
 591         switch (cmd) {
 592         case BPF_MAP_CREATE:
 593                 err = map_create(&attr);
 594                 break;
 595         case BPF_MAP_LOOKUP_ELEM:
 596                 err = map_lookup_elem(&attr);
 597                 break;
 598         case BPF_MAP_UPDATE_ELEM:
 599                 err = map_update_elem(&attr);
 600                 break;
 601         case BPF_MAP_DELETE_ELEM:
 602                 err = map_delete_elem(&attr);
 603                 break;
 604         case BPF_MAP_GET_NEXT_KEY:
 605                 err = map_get_next_key(&attr);
 606                 break;
 607         case BPF_PROG_LOAD:
 608                 err = bpf_prog_load(&attr);
 609                 break;
 610         default:
 611                 err = -EINVAL;
 612                 break;
 613         }
 614
 615         return err;
 616 }