The patch introduces BPF_MAP_TYPE_STRUCT_OPS. The map value
is a kernel struct with its func ptr implemented in bpf prog.
This new map is the interface to register/unregister/introspect
a bpf implemented kernel struct.
The kernel struct is actually embedded inside another new struct
(or called the "value" struct in the code). For example,
"struct tcp_congestion_ops" is embedded in:
struct bpf_struct_ops_tcp_congestion_ops {
refcount_t refcnt;
enum bpf_struct_ops_state state;
struct tcp_congestion_ops data; /* <-- kernel subsystem struct here */
}
The map value is "struct bpf_struct_ops_tcp_congestion_ops".
The "bpftool map dump" will then be able to show the
state ("inuse"/"tobefree") and the number of subsystem's refcnt (e.g.
number of tcp_sock in the tcp_congestion_ops case). This "value" struct
is created automatically by a macro. Having a separate "value" struct
will also make extending "struct bpf_struct_ops_XYZ" easier (e.g. adding
"void (*init)(void)" to "struct bpf_struct_ops_XYZ" to do some
initialization work before registering the struct_ops to the kernel
subsystem). The libbpf will take care of finding and populating the
"struct bpf_struct_ops_XYZ" from "struct XYZ".
Register a struct_ops to a kernel subsystem:
1. Load all needed BPF_PROG_TYPE_STRUCT_OPS prog(s)
2. Create a BPF_MAP_TYPE_STRUCT_OPS with attr->btf_vmlinux_value_type_id
set to the btf id "struct bpf_struct_ops_tcp_congestion_ops" of the
running kernel.
Instead of reusing the attr->btf_value_type_id,
btf_vmlinux_value_type_id is added such that attr->btf_fd can still be
used as the "user" btf which could store other useful sysadmin/debug
info that may be introduced in the future,
e.g. creation-date/compiler-details/map-creator...etc.
3. Create a "struct bpf_struct_ops_tcp_congestion_ops" object as described
in the running kernel btf. Populate the value of this object.
The function ptr should be populated with the prog fds.
4. Call BPF_MAP_UPDATE with the object created in (3) as
the map value. The key is always "0".
During BPF_MAP_UPDATE, the code that saves the kernel-func-ptr's
args as an array of u64 is generated. BPF_MAP_UPDATE also allows
the specific struct_ops to do some final checks in "st_ops->init_member()"
(e.g. ensure all mandatory func ptrs are implemented).
If everything looks good, it will register this kernel struct
to the kernel subsystem. The map will not allow further update
from this point.
Unregister a struct_ops from the kernel subsystem:
BPF_MAP_DELETE with key "0".
Introspect a struct_ops:
BPF_MAP_LOOKUP_ELEM with key "0". The map value returned will
have the prog _id_ populated as the func ptr.
The map value state (enum bpf_struct_ops_state) will transition through:
INIT (map created) =>
INUSE (map updated, i.e. reg) =>
TOBEFREE (map value deleted, i.e. unreg)
The kernel subsystem needs to call bpf_struct_ops_get() and
bpf_struct_ops_put() to manage the "refcnt" in the
"struct bpf_struct_ops_XYZ". This patch uses a separate refcnt
for the purpose of tracking the subsystem usage. Another approach
is to reuse the map->refcnt and then "show" (i.e. during map_lookup)
the subsystem's usage by doing map->refcnt - map->usercnt to filter out
the map-fd/pinned-map usage. However, that will also tie down the
future semantics of map->refcnt and map->usercnt.
The very first subsystem's refcnt (during reg()) holds one
count to map->refcnt. When the very last subsystem's refcnt
is gone, it will also release the map->refcnt. All bpf_prog will be
freed when the map->refcnt reaches 0 (i.e. during map_free()).
Here is what the bpftool map command output will look like:
[root@arch-fb-vm1 bpf]# bpftool map show
6: struct_ops name dctcp flags 0x0
key 4B value 256B max_entries 1 memlock 4096B
btf_id 6
[root@arch-fb-vm1 bpf]# bpftool map dump id 6
[{
"value": {
"refcnt": {
"refs": {
"counter": 1
}
},
"state": 1,
"data": {
"list": {
"next": 0,
"prev": 0
},
"key": 0,
"flags": 2,
"init": 24,
"release": 0,
"ssthresh": 25,
"cong_avoid": 30,
"set_state": 27,
"cwnd_event": 28,
"in_ack_event": 26,
"undo_cwnd": 29,
"pkts_acked": 0,
"min_tso_segs": 0,
"sndbuf_expand": 0,
"cong_control": 0,
"get_info": 0,
"name": [98,112,102,95,100,99,116,99,112,0,0,0,0,0,0,0
],
"owner": 0
}
}
}
]
Misc Notes:
* bpf_struct_ops_map_sys_lookup_elem() is added for syscall lookup.
It does an inplace update on "*value" instead of returning a pointer
to syscall.c. Otherwise, it needs a separate copy of "zero" value
for the BPF_STRUCT_OPS_STATE_INIT to avoid races.
* The bpf_struct_ops_map_delete_elem() is also called without
preempt_disable() from map_delete_elem(). It is because
the "->unreg()" may require a sleepable context, e.g.
the "tcp_unregister_congestion_control()".
* "const" is added to some of the existing "struct btf_func_model *"
function arg to avoid a compiler warning caused by this patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200109003505.3855919-1-kafai@fb.com
153 lines
4.6 KiB
C
153 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/* Copyright (c) 2018 Facebook */
|
|
|
|
#ifndef _LINUX_BTF_H
|
|
#define _LINUX_BTF_H 1
|
|
|
|
#include <linux/types.h>
|
|
#include <uapi/linux/btf.h>
|
|
|
|
/*
 * Expands to a no-op expression: the constant 0 cast to "type *" and then
 * to void, so nothing is stored or called at run time.
 * NOTE(review): presumably exists so that the compiler sees a use of "type"
 * and describes it in the generated type/debug info - confirm with callers.
 */
#define BTF_TYPE_EMIT(type) ((void)(type *)0)
|
|
|
|
/*
 * Forward declarations for types that the prototypes in this header refer
 * to by pointer.
 *
 * struct bpf_prog and struct seq_file are listed as well: a struct tag that
 * is first seen inside a prototype's parameter list only has prototype
 * scope, so without a file-scope declaration the parameter type would not
 * be compatible with the real type at the call sites.
 */
struct btf;
struct btf_member;
struct btf_type;
union bpf_attr;
struct bpf_prog;
struct seq_file;
|
|
|
|
extern const struct file_operations btf_fops;
|
|
|
|
void btf_put(struct btf *btf);
|
|
int btf_new_fd(const union bpf_attr *attr);
|
|
struct btf *btf_get_by_fd(int fd);
|
|
int btf_get_info_by_fd(const struct btf *btf,
|
|
const union bpf_attr *attr,
|
|
union bpf_attr __user *uattr);
|
|
/* Figure out the size of a type_id. If type_id is a modifier
|
|
* (e.g. const), it will be resolved to find out the type with size.
|
|
*
|
|
* For example:
|
|
* In describing "const void *", type_id is "const" and "const"
|
|
* refers to "void *". The return type will be "void *".
|
|
*
|
|
* If type_id is a simple "int", then return type will be "int".
|
|
*
|
|
* @btf: struct btf object
|
|
* @type_id: Find out the size of type_id. The type_id of the return
|
|
* type is set to *type_id.
|
|
* @ret_size: It can be NULL. If not NULL, the size of the return
|
|
* type is set to *ret_size.
|
|
* Return: The btf_type (resolved to another type with size info if needed).
|
|
* NULL is returned if type_id itself does not have size info
|
|
* (e.g. void) or it cannot be resolved to another type that
|
|
* has size info.
|
|
* *type_id and *ret_size will not be changed in the
|
|
* NULL return case.
|
|
*/
|
|
const struct btf_type *btf_type_id_size(const struct btf *btf,
|
|
u32 *type_id,
|
|
u32 *ret_size);
|
|
void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
|
|
struct seq_file *m);
|
|
int btf_get_fd_by_id(u32 id);
|
|
u32 btf_id(const struct btf *btf);
|
|
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
|
|
const struct btf_member *m,
|
|
u32 expected_offset, u32 expected_size);
|
|
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
|
|
bool btf_type_is_void(const struct btf_type *t);
|
|
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
|
|
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
|
|
u32 id, u32 *res_id);
|
|
const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
|
|
u32 id, u32 *res_id);
|
|
const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
|
|
u32 id, u32 *res_id);
|
|
const struct btf_type *
|
|
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
|
|
u32 *type_size, const struct btf_type **elem_type,
|
|
u32 *total_nelems);
|
|
|
|
/*
 * for_each_member - walk the member entries that follow a struct btf_type
 * @i:           integer lvalue used as the loop counter (starts at 0)
 * @struct_type: type whose members are visited; evaluated once at loop
 *               entry and again on every bound check, so it must be free
 *               of side effects
 * @member:      pointer lvalue used as the cursor; starts at
 *               btf_type_member(@struct_type) and advances by one entry
 *               per iteration
 *
 * The loop runs btf_type_vlen(@struct_type) times.
 *
 * @i and @member are parenthesized in the expansion so that lvalue
 * expressions such as "*cursor_ptr" are assigned and incremented as a
 * whole (a bare "*cursor_ptr++" would advance cursor_ptr instead).
 */
#define for_each_member(i, struct_type, member)				\
	for ((i) = 0, (member) = btf_type_member(struct_type);		\
	     (i) < btf_type_vlen(struct_type);				\
	     (i)++, (member)++)
|
|
|
|
static inline bool btf_type_is_ptr(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
|
|
}
|
|
|
|
static inline bool btf_type_is_int(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
|
|
}
|
|
|
|
static inline bool btf_type_is_enum(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
|
|
}
|
|
|
|
static inline bool btf_type_is_typedef(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF;
|
|
}
|
|
|
|
static inline bool btf_type_is_func(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
|
|
}
|
|
|
|
static inline bool btf_type_is_func_proto(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
|
|
}
|
|
|
|
static inline u16 btf_type_vlen(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_VLEN(t->info);
|
|
}
|
|
|
|
static inline bool btf_type_kflag(const struct btf_type *t)
|
|
{
|
|
return BTF_INFO_KFLAG(t->info);
|
|
}
|
|
|
|
static inline u32 btf_member_bit_offset(const struct btf_type *struct_type,
|
|
const struct btf_member *member)
|
|
{
|
|
return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
|
|
: member->offset;
|
|
}
|
|
|
|
static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type,
|
|
const struct btf_member *member)
|
|
{
|
|
return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
|
|
: 0;
|
|
}
|
|
|
|
static inline const struct btf_member *btf_type_member(const struct btf_type *t)
|
|
{
|
|
return (const struct btf_member *)(t + 1);
|
|
}
|
|
|
|
#ifdef CONFIG_BPF_SYSCALL
|
|
/*
 * Declarations that are only provided when CONFIG_BPF_SYSCALL is set.
 * NOTE(review): the #else branch visible in this header supplies inline
 * stubs for btf_type_by_id() and btf_name_by_offset() only; no stub is
 * visible for btf_parse_vmlinux() or bpf_prog_get_target_btf().
 */
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);

struct btf *btf_parse_vmlinux(void);
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
|
|
#else
|
|
static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
|
|
u32 type_id)
|
|
{
|
|
return NULL;
|
|
}
|
|
static inline const char *btf_name_by_offset(const struct btf *btf,
|
|
u32 offset)
|
|
{
|
|
return NULL;
|
|
}
|
|
#endif
|
|
|
|
#endif
|