Merge branch 'ebpf-next'
author     David S. Miller <davem@davemloft.net>
           Sat, 6 Dec 2014 05:47:48 +0000 (21:47 -0800)
committer  David S. Miller <davem@davemloft.net>
           Sat, 6 Dec 2014 05:47:48 +0000 (21:47 -0800)
Alexei Starovoitov says:

====================
allow eBPF programs to be attached to sockets

V1->V2:

Fixed comments in the sample code to state clearly that packet data is accessed
with LD_ABS instructions and not via internal skb fields.
Also replaced constants in:
BPF_LD_ABS(BPF_B, 14 + 9 /* R0 = ip->proto */),
with:
BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),

V1 cover:

Introduce BPF_PROG_TYPE_SOCKET_FILTER type of eBPF programs that can be
attached to sockets with setsockopt().
Allow such programs to access maps via lookup/update/delete helpers.
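
For illustration, the user-space side of this boils down to roughly the
following (a minimal sketch on top of the samples/bpf/libbpf.{c,h} helpers
added in this set; see sock_example.c below for a complete program):

  struct bpf_insn prog[] = {
          BPF_MOV64_IMM(BPF_REG_0, 0),    /* r0 = 0 */
          BPF_EXIT_INSN(),                /* return r0 */
  };
  int prog_fd, sock;

  /* BPF_PROG_LOAD syscall wrapper; returns a program fd */
  prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog), "GPL");

  /* attach the program fd to a raw socket via the new socket option */
  sock = open_raw_sock("lo");
  setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd));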

This feature was previewed by the bpf manpage in commit b4fc1a460f30 ("Merge branch 'bpf-next'").
Now it can actually run.

The 1st patch adds LD_ABS/LD_IND instruction verification and
the 2nd patch adds a new setsockopt() option.
Patches 3-6 are examples in assembler and in C.

Though native eBPF programs are way more powerful than classic filters
(attachable through a similar setsockopt() call), they don't have skb field
accessors yet; for example, skb->pkt_type and skb->dev->ifindex are not
accessible. There are several ways to achieve that, which will come in the
next set of patches. So in this set native eBPF programs can only read
packet data and access maps.
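
Concretely, a socket filter written in C is limited to the pattern below:
packet bytes are read via the load_byte()/load_half()/load_word() wrappers
(which LLVM turns into LD_ABS/LD_IND instructions), and state lives in maps
accessed through the bpf_map_*() helpers. This is essentially sockex1_kern.c
from this set, with the map definition omitted:

  SEC("socket1")
  int bpf_prog1(struct sk_buff *skb)
  {
          /* one byte of packet data: ip->protocol */
          int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
          long *value;

          value = bpf_map_lookup_elem(&my_map, &index);
          if (value)
                  __sync_fetch_and_add(value, 1);

          return 0;
  }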

The most powerful example is sockex2_kern.c from patch 6, where ~200 lines of C
are compiled into ~300 eBPF instructions.
It shows how fairly complex packet parsing can be done.

The LLVM used to build the examples is at https://github.com/iovisor/llvm,
which is a fork of llvm trunk that I'm cleaning up for upstreaming.
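
The samples/bpf/Makefile rule added in this set drives that toolchain roughly
as follows (trimmed; the real rule also passes the kernel include paths and
-D__KERNEL__):

  clang -O2 -emit-llvm -c sockex1_kern.c -o - | \
          llc -march=bpf -filetype=obj -o sockex1_kern.o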
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
31 files changed:
arch/alpha/include/uapi/asm/socket.h
arch/avr32/include/uapi/asm/socket.h
arch/cris/include/uapi/asm/socket.h
arch/frv/include/uapi/asm/socket.h
arch/ia64/include/uapi/asm/socket.h
arch/m32r/include/uapi/asm/socket.h
arch/mips/include/uapi/asm/socket.h
arch/mn10300/include/uapi/asm/socket.h
arch/parisc/include/uapi/asm/socket.h
arch/powerpc/include/uapi/asm/socket.h
arch/s390/include/uapi/asm/socket.h
arch/sparc/include/uapi/asm/socket.h
arch/xtensa/include/uapi/asm/socket.h
include/linux/bpf.h
include/linux/filter.h
include/uapi/asm-generic/socket.h
include/uapi/linux/bpf.h
kernel/bpf/verifier.c
net/core/filter.c
net/core/sock.c
samples/bpf/Makefile
samples/bpf/bpf_helpers.h [new file with mode: 0644]
samples/bpf/bpf_load.c [new file with mode: 0644]
samples/bpf/bpf_load.h [new file with mode: 0644]
samples/bpf/libbpf.c
samples/bpf/libbpf.h
samples/bpf/sock_example.c [new file with mode: 0644]
samples/bpf/sockex1_kern.c [new file with mode: 0644]
samples/bpf/sockex1_user.c [new file with mode: 0644]
samples/bpf/sockex2_kern.c [new file with mode: 0644]
samples/bpf/sockex2_user.c [new file with mode: 0644]

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index e2fe070..9a20821 100644
@@ -89,4 +89,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 92121b0..2b65ed6 100644
@@ -82,4 +82,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
index 60f60f5..e2503d9 100644
@@ -84,6 +84,9 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
 
 
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index 2c68902..4823ad1 100644
@@ -82,5 +82,8 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 09a93fb..59be3d8 100644
@@ -91,4 +91,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index e858981..7bc4cb2 100644
@@ -82,4 +82,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 2e9ee8c..dec3c85 100644
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index f3492e8..cab7d6d 100644
@@ -82,4 +82,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 7984a1c..a5cd40c 100644
@@ -81,4 +81,7 @@
 
 #define SO_INCOMING_CPU                0x402A
 
+#define SO_ATTACH_BPF          0x402B
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 3474e4e..c046666 100644
@@ -89,4 +89,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 8457636..296942d 100644
@@ -88,4 +88,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 4a8003a..e6a16c4 100644
@@ -78,6 +78,9 @@
 
 #define SO_INCOMING_CPU                0x0033
 
+#define SO_ATTACH_BPF          0x0034
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION             0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT       0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index c46f6a6..4120af0 100644
@@ -93,4 +93,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* _XTENSA_SOCKET_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 75e94ea..bbfceb7 100644
@@ -128,7 +128,11 @@ struct bpf_prog_aux {
        struct work_struct work;
 };
 
+#ifdef CONFIG_BPF_SYSCALL
 void bpf_prog_put(struct bpf_prog *prog);
+#else
+static inline void bpf_prog_put(struct bpf_prog *prog) {}
+#endif
 struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ca95abd..caac208 100644
@@ -381,6 +381,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
 void bpf_prog_destroy(struct bpf_prog *fp);
 
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
+int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_detach_filter(struct sock *sk);
 
 int bpf_check_classic(const struct sock_filter *filter, unsigned int flen);
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index f541cce..5c15c2a 100644
@@ -84,4 +84,7 @@
 
 #define SO_INCOMING_CPU                49
 
+#define SO_ATTACH_BPF          50
+#define SO_DETACH_BPF          SO_DETACH_FILTER
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4a3d0f8..45da7ec 100644
@@ -117,6 +117,7 @@ enum bpf_map_type {
 
 enum bpf_prog_type {
        BPF_PROG_TYPE_UNSPEC,
+       BPF_PROG_TYPE_SOCKET_FILTER,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b6a1f7c..a28e09c 100644
@@ -1172,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
        return 0;
 }
 
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ *   preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ *   ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ *   SRC == any register
+ *   IMM == 32-bit immediate
+ *
+ * Output:
+ *   R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+       struct reg_state *regs = env->cur_state.regs;
+       u8 mode = BPF_MODE(insn->code);
+       struct reg_state *reg;
+       int i, err;
+
+       if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+               verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+               return -EINVAL;
+       }
+
+       if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+           (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+               verbose("BPF_LD_ABS uses reserved fields\n");
+               return -EINVAL;
+       }
+
+       /* check whether implicit source operand (register R6) is readable */
+       err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+       if (err)
+               return err;
+
+       if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+               verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+               return -EINVAL;
+       }
+
+       if (mode == BPF_IND) {
+               /* check explicit source operand */
+               err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+               if (err)
+                       return err;
+       }
+
+       /* reset caller saved regs to unreadable */
+       for (i = 0; i < CALLER_SAVED_REGS; i++) {
+               reg = regs + caller_saved[i];
+               reg->type = NOT_INIT;
+               reg->imm = 0;
+       }
+
+       /* mark destination R0 register as readable, since it contains
+        * the value fetched from the packet
+        */
+       regs[BPF_REG_0].type = UNKNOWN_VALUE;
+       return 0;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -1677,8 +1741,10 @@ process_bpf_exit:
                        u8 mode = BPF_MODE(insn->code);
 
                        if (mode == BPF_ABS || mode == BPF_IND) {
-                               verbose("LD_ABS is not supported yet\n");
-                               return -EINVAL;
+                               err = check_ld_abs(env, insn);
+                               if (err)
+                                       return err;
+
                        } else if (mode == BPF_IMM) {
                                err = check_ld_imm(env, insn);
                                if (err)
diff --git a/net/core/filter.c b/net/core/filter.c
index 647b122..8cc3c03 100644
@@ -44,6 +44,7 @@
 #include <linux/ratelimit.h>
 #include <linux/seccomp.h>
 #include <linux/if_vlan.h>
+#include <linux/bpf.h>
 
 /**
  *     sk_filter - run a packet through a socket filter
@@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
 
 static void __bpf_prog_release(struct bpf_prog *prog)
 {
-       bpf_release_orig_filter(prog);
-       bpf_prog_free(prog);
+       if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
+               bpf_prog_put(prog);
+       } else {
+               bpf_release_orig_filter(prog);
+               bpf_prog_free(prog);
+       }
 }
 
 static void __sk_filter_release(struct sk_filter *fp)
@@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_attach_filter);
 
+#ifdef CONFIG_BPF_SYSCALL
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+       struct sk_filter *fp, *old_fp;
+       struct bpf_prog *prog;
+
+       if (sock_flag(sk, SOCK_FILTER_LOCKED))
+               return -EPERM;
+
+       prog = bpf_prog_get(ufd);
+       if (!prog)
+               return -EINVAL;
+
+       if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+               /* valid fd, but invalid program type */
+               bpf_prog_put(prog);
+               return -EINVAL;
+       }
+
+       fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+       if (!fp) {
+               bpf_prog_put(prog);
+               return -ENOMEM;
+       }
+       fp->prog = prog;
+
+       atomic_set(&fp->refcnt, 0);
+
+       if (!sk_filter_charge(sk, fp)) {
+               __sk_filter_release(fp);
+               return -ENOMEM;
+       }
+
+       old_fp = rcu_dereference_protected(sk->sk_filter,
+                                          sock_owned_by_user(sk));
+       rcu_assign_pointer(sk->sk_filter, fp);
+
+       if (old_fp)
+               sk_filter_uncharge(sk, old_fp);
+
+       return 0;
+}
+
+/* allow socket filters to call
+ * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
+ */
+static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id)
+{
+       switch (func_id) {
+       case BPF_FUNC_map_lookup_elem:
+               return &bpf_map_lookup_elem_proto;
+       case BPF_FUNC_map_update_elem:
+               return &bpf_map_update_elem_proto;
+       case BPF_FUNC_map_delete_elem:
+               return &bpf_map_delete_elem_proto;
+       default:
+               return NULL;
+       }
+}
+
+static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+       /* skb fields cannot be accessed yet */
+       return false;
+}
+
+static struct bpf_verifier_ops sock_filter_ops = {
+       .get_func_proto = sock_filter_func_proto,
+       .is_valid_access = sock_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+       .ops = &sock_filter_ops,
+       .type = BPF_PROG_TYPE_SOCKET_FILTER,
+};
+
+static int __init register_sock_filter_ops(void)
+{
+       bpf_register_prog_type(&tl);
+       return 0;
+}
+late_initcall(register_sock_filter_ops);
+#else
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+       return -EOPNOTSUPP;
+}
+#endif
 int sk_detach_filter(struct sock *sk)
 {
        int ret = -ENOENT;
diff --git a/net/core/sock.c b/net/core/sock.c
index 0725cf0..9a56b20 100644
@@ -888,6 +888,19 @@ set_rcvbuf:
                }
                break;
 
+       case SO_ATTACH_BPF:
+               ret = -EINVAL;
+               if (optlen == sizeof(u32)) {
+                       u32 ufd;
+
+                       ret = -EFAULT;
+                       if (copy_from_user(&ufd, optval, sizeof(ufd)))
+                               break;
+
+                       ret = sk_attach_bpf(ufd, sk);
+               }
+               break;
+
        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0718d9c..b5b3600 100644
@@ -3,11 +3,31 @@ obj- := dummy.o
 
 # List of programs to build
 hostprogs-y := test_verifier test_maps
+hostprogs-y += sock_example
+hostprogs-y += sockex1
+hostprogs-y += sockex2
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
+sock_example-objs := sock_example.o libbpf.o
+sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
+sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
+always += sockex1_kern.o
+always += sockex2_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
+
+HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
+HOSTLOADLIBES_sockex1 += -lelf
+HOSTLOADLIBES_sockex2 += -lelf
+
+# point this to your LLVM backend with bpf support
+LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
+
+%.o: %.c
+       clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
+               -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
+               -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
new file mode 100644
index 0000000..ca03331
--- /dev/null
@@ -0,0 +1,40 @@
+#ifndef __BPF_HELPERS_H
+#define __BPF_HELPERS_H
+
+/* helper macro to place programs, maps, license in
+ * different sections in elf_bpf file. Section names
+ * are interpreted by elf_bpf loader
+ */
+#define SEC(NAME) __attribute__((section(NAME), used))
+
+/* helper functions called from eBPF programs written in C */
+static void *(*bpf_map_lookup_elem)(void *map, void *key) =
+       (void *) BPF_FUNC_map_lookup_elem;
+static int (*bpf_map_update_elem)(void *map, void *key, void *value,
+                                 unsigned long long flags) =
+       (void *) BPF_FUNC_map_update_elem;
+static int (*bpf_map_delete_elem)(void *map, void *key) =
+       (void *) BPF_FUNC_map_delete_elem;
+
+/* llvm builtin functions that eBPF C program may use to
+ * emit BPF_LD_ABS and BPF_LD_IND instructions
+ */
+struct sk_buff;
+unsigned long long load_byte(void *skb,
+                            unsigned long long off) asm("llvm.bpf.load.byte");
+unsigned long long load_half(void *skb,
+                            unsigned long long off) asm("llvm.bpf.load.half");
+unsigned long long load_word(void *skb,
+                            unsigned long long off) asm("llvm.bpf.load.word");
+
+/* a helper structure used by eBPF C program
+ * to describe map attributes to elf_bpf loader
+ */
+struct bpf_map_def {
+       unsigned int type;
+       unsigned int key_size;
+       unsigned int value_size;
+       unsigned int max_entries;
+};
+
+#endif
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
new file mode 100644
index 0000000..1831d23
--- /dev/null
@@ -0,0 +1,203 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <libelf.h>
+#include <gelf.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include "libbpf.h"
+#include "bpf_helpers.h"
+#include "bpf_load.h"
+
+static char license[128];
+static bool processed_sec[128];
+int map_fd[MAX_MAPS];
+int prog_fd[MAX_PROGS];
+int prog_cnt;
+
+static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
+{
+       int fd;
+       bool is_socket = strncmp(event, "socket", 6) == 0;
+
+       if (!is_socket)
+               /* tracing events tbd */
+               return -1;
+
+       fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
+                          prog, size, license);
+
+       if (fd < 0) {
+               printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
+               return -1;
+       }
+
+       prog_fd[prog_cnt++] = fd;
+
+       return 0;
+}
+
+static int load_maps(struct bpf_map_def *maps, int len)
+{
+       int i;
+
+       for (i = 0; i < len / sizeof(struct bpf_map_def); i++) {
+
+               map_fd[i] = bpf_create_map(maps[i].type,
+                                          maps[i].key_size,
+                                          maps[i].value_size,
+                                          maps[i].max_entries);
+               if (map_fd[i] < 0)
+                       return 1;
+       }
+       return 0;
+}
+
+static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
+                  GElf_Shdr *shdr, Elf_Data **data)
+{
+       Elf_Scn *scn;
+
+       scn = elf_getscn(elf, i);
+       if (!scn)
+               return 1;
+
+       if (gelf_getshdr(scn, shdr) != shdr)
+               return 2;
+
+       *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+       if (!*shname || !shdr->sh_size)
+               return 3;
+
+       *data = elf_getdata(scn, 0);
+       if (!*data || elf_getdata(scn, *data) != NULL)
+               return 4;
+
+       return 0;
+}
+
+static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
+                               GElf_Shdr *shdr, struct bpf_insn *insn)
+{
+       int i, nrels;
+
+       nrels = shdr->sh_size / shdr->sh_entsize;
+
+       for (i = 0; i < nrels; i++) {
+               GElf_Sym sym;
+               GElf_Rel rel;
+               unsigned int insn_idx;
+
+               gelf_getrel(data, i, &rel);
+
+               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+
+               gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
+
+               if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
+                       printf("invalid relo for insn[%d].code 0x%x\n",
+                              insn_idx, insn[insn_idx].code);
+                       return 1;
+               }
+               insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
+               insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)];
+       }
+
+       return 0;
+}
+
+int load_bpf_file(char *path)
+{
+       int fd, i;
+       Elf *elf;
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr, shdr_prog;
+       Elf_Data *data, *data_prog, *symbols = NULL;
+       char *shname, *shname_prog;
+
+       if (elf_version(EV_CURRENT) == EV_NONE)
+               return 1;
+
+       fd = open(path, O_RDONLY, 0);
+       if (fd < 0)
+               return 1;
+
+       elf = elf_begin(fd, ELF_C_READ, NULL);
+
+       if (!elf)
+               return 1;
+
+       if (gelf_getehdr(elf, &ehdr) != &ehdr)
+               return 1;
+
+       /* scan over all elf sections to get license and map info */
+       for (i = 1; i < ehdr.e_shnum; i++) {
+
+               if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+                       continue;
+
+               if (0) /* helpful for llvm debugging */
+                       printf("section %d:%s data %p size %zd link %d flags %d\n",
+                              i, shname, data->d_buf, data->d_size,
+                              shdr.sh_link, (int) shdr.sh_flags);
+
+               if (strcmp(shname, "license") == 0) {
+                       processed_sec[i] = true;
+                       memcpy(license, data->d_buf, data->d_size);
+               } else if (strcmp(shname, "maps") == 0) {
+                       processed_sec[i] = true;
+                       if (load_maps(data->d_buf, data->d_size))
+                               return 1;
+               } else if (shdr.sh_type == SHT_SYMTAB) {
+                       symbols = data;
+               }
+       }
+
+       /* load programs that need map fixup (relocations) */
+       for (i = 1; i < ehdr.e_shnum; i++) {
+
+               if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+                       continue;
+               if (shdr.sh_type == SHT_REL) {
+                       struct bpf_insn *insns;
+
+                       if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
+                                   &shdr_prog, &data_prog))
+                               continue;
+
+                       insns = (struct bpf_insn *) data_prog->d_buf;
+
+                       processed_sec[shdr.sh_info] = true;
+                       processed_sec[i] = true;
+
+                       if (parse_relo_and_apply(data, symbols, &shdr, insns))
+                               continue;
+
+                       if (memcmp(shname_prog, "events/", 7) == 0 ||
+                           memcmp(shname_prog, "socket", 6) == 0)
+                               load_and_attach(shname_prog, insns, data_prog->d_size);
+               }
+       }
+
+       /* load programs that don't use maps */
+       for (i = 1; i < ehdr.e_shnum; i++) {
+
+               if (processed_sec[i])
+                       continue;
+
+               if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+                       continue;
+
+               if (memcmp(shname, "events/", 7) == 0 ||
+                   memcmp(shname, "socket", 6) == 0)
+                       load_and_attach(shname, data->d_buf, data->d_size);
+       }
+
+       close(fd);
+       return 0;
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
new file mode 100644
index 0000000..27789a3
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef __BPF_LOAD_H
+#define __BPF_LOAD_H
+
+#define MAX_MAPS 32
+#define MAX_PROGS 32
+
+extern int map_fd[MAX_MAPS];
+extern int prog_fd[MAX_PROGS];
+
+/* parses elf file compiled by llvm .c->.o
+ * . parses 'maps' section and creates maps via BPF syscall
+ * . parses 'license' section and passes it to syscall
+ * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by
+ *   storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD
+ * . loads eBPF programs via BPF syscall
+ *
+ * One ELF file can contain multiple BPF programs which will be loaded
+ * and their FDs stored in prog_fd array
+ *
+ * returns zero on success
+ */
+int load_bpf_file(char *path);
+
+#endif
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 17bb520..46d50b7 100644
@@ -7,6 +7,10 @@
 #include <linux/netlink.h>
 #include <linux/bpf.h>
 #include <errno.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <linux/if_packet.h>
+#include <arpa/inet.h>
 #include "libbpf.h"
 
 static __u64 ptr_to_u64(void *ptr)
@@ -93,3 +97,27 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 
        return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 }
+
+int open_raw_sock(const char *name)
+{
+       struct sockaddr_ll sll;
+       int sock;
+
+       sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
+       if (sock < 0) {
+               printf("cannot create raw socket\n");
+               return -1;
+       }
+
+       memset(&sll, 0, sizeof(sll));
+       sll.sll_family = AF_PACKET;
+       sll.sll_ifindex = if_nametoindex(name);
+       sll.sll_protocol = htons(ETH_P_ALL);
+       if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
+               printf("bind to %s: %s\n", name, strerror(errno));
+               close(sock);
+               return -1;
+       }
+
+       return sock;
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index f8678e5..58c5fe1 100644
@@ -15,7 +15,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
                  const struct bpf_insn *insns, int insn_len,
                  const char *license);
 
-#define LOG_BUF_SIZE 8192
+#define LOG_BUF_SIZE 65536
 extern char bpf_log_buf[LOG_BUF_SIZE];
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
@@ -99,6 +99,16 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
        BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
 
 
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM)                                  \
+       ((struct bpf_insn) {                                    \
+               .code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,     \
+               .dst_reg = 0,                                   \
+               .src_reg = 0,                                   \
+               .off   = 0,                                     \
+               .imm   = IMM })
+
 /* Memory load, dst_reg = *(uint *) (src_reg + off16) */
 
 #define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                       \
@@ -169,4 +179,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
                .off   = 0,                                     \
                .imm   = 0 })
 
+/* create RAW socket and bind to interface 'name' */
+int open_raw_sock(const char *name);
+
 #endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
new file mode 100644
index 0000000..c8ad040
--- /dev/null
@@ -0,0 +1,101 @@
+/* eBPF example program:
+ * - creates arraymap in kernel with key 4 bytes and value 8 bytes
+ *
+ * - loads eBPF program:
+ *   r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
+ *   *(u32*)(fp - 4) = r0;
+ *   // assuming packet is IPv4, lookup ip->proto in a map
+ *   value = bpf_map_lookup_elem(map_fd, fp - 4);
+ *   if (value)
+ *        (*(u64*)value) += 1;
+ *
+ * - attaches this program to eth0 raw socket
+ *
+ * - every second user space reads map[tcp], map[udp], map[icmp] to see
+ *   how many packets of given protocol were seen on eth0
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <stddef.h>
+#include "libbpf.h"
+
+static int test_sock(void)
+{
+       int sock = -1, map_fd, prog_fd, i, key;
+       long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;
+
+       map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
+                               256);
+       if (map_fd < 0) {
+               printf("failed to create map '%s'\n", strerror(errno));
+               goto cleanup;
+       }
+
+       struct bpf_insn prog[] = {
+               BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+               BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),
+               BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+               BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+               BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+               BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+               BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+               BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+               BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+               BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+               BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+               BPF_EXIT_INSN(),
+       };
+
+       prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
+                               "GPL");
+       if (prog_fd < 0) {
+               printf("failed to load prog '%s'\n", strerror(errno));
+               goto cleanup;
+       }
+
+       sock = open_raw_sock("lo");
+
+       if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+                      sizeof(prog_fd)) < 0) {
+               printf("setsockopt %s\n", strerror(errno));
+               goto cleanup;
+       }
+
+       for (i = 0; i < 10; i++) {
+               key = IPPROTO_TCP;
+               assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
+
+               key = IPPROTO_UDP;
+               assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0);
+
+               key = IPPROTO_ICMP;
+               assert(bpf_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
+
+               printf("TCP %lld UDP %lld ICMP %lld packets\n",
+                      tcp_cnt, udp_cnt, icmp_cnt);
+               sleep(1);
+       }
+
+cleanup:
+       /* maps, programs, raw sockets will auto cleanup on process exit */
+       return 0;
+}
+
+int main(void)
+{
+       FILE *f;
+
+       f = popen("ping -c5 localhost", "r");
+       (void)f;
+
+       return test_sock();
+}
diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c
new file mode 100644
index 0000000..0668926
--- /dev/null
@@ -0,0 +1,25 @@
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(u32),
+       .value_size = sizeof(long),
+       .max_entries = 256,
+};
+
+SEC("socket1")
+int bpf_prog1(struct sk_buff *skb)
+{
+       int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
+       long *value;
+
+       value = bpf_map_lookup_elem(&my_map, &index);
+       if (value)
+               __sync_fetch_and_add(value, 1);
+
+       return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
new file mode 100644
index 0000000..34a443f
--- /dev/null
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+int main(int ac, char **argv)
+{
+       char filename[256];
+       FILE *f;
+       int i, sock;
+
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+       if (load_bpf_file(filename)) {
+               printf("%s", bpf_log_buf);
+               return 1;
+       }
+
+       sock = open_raw_sock("lo");
+
+       assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
+                         sizeof(prog_fd[0])) == 0);
+
+       f = popen("ping -c5 localhost", "r");
+       (void) f;
+
+       for (i = 0; i < 5; i++) {
+               long long tcp_cnt, udp_cnt, icmp_cnt;
+               int key;
+
+               key = IPPROTO_TCP;
+               assert(bpf_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0);
+
+               key = IPPROTO_UDP;
+               assert(bpf_lookup_elem(map_fd[0], &key, &udp_cnt) == 0);
+
+               key = IPPROTO_ICMP;
+               assert(bpf_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0);
+
+               printf("TCP %lld UDP %lld ICMP %lld packets\n",
+                      tcp_cnt, udp_cnt, icmp_cnt);
+               sleep(1);
+       }
+
+       return 0;
+}
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
new file mode 100644
index 0000000..6f0135f
--- /dev/null
@@ -0,0 +1,215 @@
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/in.h>
+#include <uapi/linux/if.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/if_tunnel.h>
+#define IP_MF          0x2000
+#define IP_OFFSET      0x1FFF
+
+struct vlan_hdr {
+       __be16 h_vlan_TCI;
+       __be16 h_vlan_encapsulated_proto;
+};
+
+struct flow_keys {
+       __be32 src;
+       __be32 dst;
+       union {
+               __be32 ports;
+               __be16 port16[2];
+       };
+       __u16 thoff;
+       __u8 ip_proto;
+};
+
+static inline int proto_ports_offset(__u64 proto)
+{
+       switch (proto) {
+       case IPPROTO_TCP:
+       case IPPROTO_UDP:
+       case IPPROTO_DCCP:
+       case IPPROTO_ESP:
+       case IPPROTO_SCTP:
+       case IPPROTO_UDPLITE:
+               return 0;
+       case IPPROTO_AH:
+               return 4;
+       default:
+               return 0;
+       }
+}
+
+static inline int ip_is_fragment(struct sk_buff *ctx, __u64 nhoff)
+{
+       return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+               & (IP_MF | IP_OFFSET);
+}
+
+static inline __u32 ipv6_addr_hash(struct sk_buff *ctx, __u64 off)
+{
+       __u64 w0 = load_word(ctx, off);
+       __u64 w1 = load_word(ctx, off + 4);
+       __u64 w2 = load_word(ctx, off + 8);
+       __u64 w3 = load_word(ctx, off + 12);
+
+       return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+static inline __u64 parse_ip(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+                            struct flow_keys *flow)
+{
+       __u64 verlen;
+
+       if (unlikely(ip_is_fragment(skb, nhoff)))
+               *ip_proto = 0;
+       else
+               *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
+
+       if (*ip_proto != IPPROTO_GRE) {
+               flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
+               flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
+       }
+
+       verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
+       if (likely(verlen == 0x45))
+               nhoff += 20;
+       else
+               nhoff += (verlen & 0xF) << 2;
+
+       return nhoff;
+}
+
+static inline __u64 parse_ipv6(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+                              struct flow_keys *flow)
+{
+       *ip_proto = load_byte(skb,
+                             nhoff + offsetof(struct ipv6hdr, nexthdr));
+       flow->src = ipv6_addr_hash(skb,
+                                  nhoff + offsetof(struct ipv6hdr, saddr));
+       flow->dst = ipv6_addr_hash(skb,
+                                  nhoff + offsetof(struct ipv6hdr, daddr));
+       nhoff += sizeof(struct ipv6hdr);
+
+       return nhoff;
+}
+
+static inline bool flow_dissector(struct sk_buff *skb, struct flow_keys *flow)
+{
+       __u64 nhoff = ETH_HLEN;
+       __u64 ip_proto;
+       __u64 proto = load_half(skb, 12);
+       int poff;
+
+       if (proto == ETH_P_8021AD) {
+               proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+                                                       h_vlan_encapsulated_proto));
+               nhoff += sizeof(struct vlan_hdr);
+       }
+
+       if (proto == ETH_P_8021Q) {
+               proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+                                                       h_vlan_encapsulated_proto));
+               nhoff += sizeof(struct vlan_hdr);
+       }
+
+       if (likely(proto == ETH_P_IP))
+               nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+       else if (proto == ETH_P_IPV6)
+               nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+       else
+               return false;
+
+       switch (ip_proto) {
+       case IPPROTO_GRE: {
+               struct gre_hdr {
+                       __be16 flags;
+                       __be16 proto;
+               };
+
+               __u64 gre_flags = load_half(skb,
+                                           nhoff + offsetof(struct gre_hdr, flags));
+               __u64 gre_proto = load_half(skb,
+                                           nhoff + offsetof(struct gre_hdr, proto));
+
+               if (gre_flags & (GRE_VERSION|GRE_ROUTING))
+                       break;
+
+               proto = gre_proto;
+               nhoff += 4;
+               if (gre_flags & GRE_CSUM)
+                       nhoff += 4;
+               if (gre_flags & GRE_KEY)
+                       nhoff += 4;
+               if (gre_flags & GRE_SEQ)
+                       nhoff += 4;
+
+               if (proto == ETH_P_8021Q) {
+                       proto = load_half(skb,
+                                         nhoff + offsetof(struct vlan_hdr,
+                                                          h_vlan_encapsulated_proto));
+                       nhoff += sizeof(struct vlan_hdr);
+               }
+
+               if (proto == ETH_P_IP)
+                       nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+               else if (proto == ETH_P_IPV6)
+                       nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+               else
+                       return false;
+               break;
+       }
+       case IPPROTO_IPIP:
+               nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+               break;
+       case IPPROTO_IPV6:
+               nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+               break;
+       default:
+               break;
+       }
+
+       flow->ip_proto = ip_proto;
+       poff = proto_ports_offset(ip_proto);
+       if (poff >= 0) {
+               nhoff += poff;
+               flow->ports = load_word(skb, nhoff);
+       }
+
+       flow->thoff = (__u16) nhoff;
+
+       return true;
+}
+
+struct bpf_map_def SEC("maps") hash_map = {
+       .type = BPF_MAP_TYPE_HASH,
+       .key_size = sizeof(__be32),
+       .value_size = sizeof(long),
+       .max_entries = 1024,
+};
+
+SEC("socket2")
+int bpf_prog2(struct sk_buff *skb)
+{
+       struct flow_keys flow;
+       long *value;
+       u32 key;
+
+       if (!flow_dissector(skb, &flow))
+               return 0;
+
+       key = flow.dst;
+       value = bpf_map_lookup_elem(&hash_map, &key);
+       if (value) {
+               __sync_fetch_and_add(value, 1);
+       } else {
+               long val = 1;
+
+               bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
+       }
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
new file mode 100644
index 0000000..d2d5f5a
--- /dev/null
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+int main(int ac, char **argv)
+{
+       char filename[256];
+       FILE *f;
+       int i, sock;
+
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+       if (load_bpf_file(filename)) {
+               printf("%s", bpf_log_buf);
+               return 1;
+       }
+
+       sock = open_raw_sock("lo");
+
+       assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
+                         sizeof(prog_fd[0])) == 0);
+
+       f = popen("ping -c5 localhost", "r");
+       (void) f;
+
+       for (i = 0; i < 5; i++) {
+               int key = 0, next_key;
+               long long value;
+
+               while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+                       bpf_lookup_elem(map_fd[0], &next_key, &value);
+                       printf("ip %s count %lld\n",
+                              inet_ntoa((struct in_addr){htonl(next_key)}),
+                              value);
+                       key = next_key;
+               }
+               sleep(1);
+       }
+       return 0;
+}