diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/arraymap.c | 30 | ||||
-rw-r--r-- | kernel/bpf/bpf_local_storage.c | 2 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 420 | ||||
-rw-r--r-- | kernel/bpf/cpumap.c | 9 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 38 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 6 | ||||
-rw-r--r-- | kernel/bpf/local_storage.c | 2 | ||||
-rw-r--r-- | kernel/bpf/map_in_map.c | 19 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 373 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 485 |
10 files changed, 783 insertions, 601 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 832b2659e96e..672eb17ac421 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -306,14 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_fields(struct bpf_array *arr, void *val) -{ - if (map_value_has_timer(&arr->map)) - bpf_timer_cancel_and_free(val + arr->map.timer_off); - if (map_value_has_kptrs(&arr->map)) - bpf_map_free_kptrs(&arr->map, val); -} - /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -335,13 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -349,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } return 0; } @@ -386,7 +378,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -409,12 +401,12 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(map)) + /* We don't reset or free fields other than timer on uref dropping to zero. */ + if (!btf_record_has_field(map->record, BPF_TIMER)) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -423,22 +415,22 @@ static void array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (map_value_has_kptrs(map)) { + if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { - bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 93d9b1b17bc8..37020078d1c1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -382,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(&smap->map))) + !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..5579ff3a5b54 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3191,7 +3191,7 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -enum btf_field_type { +enum btf_field_info_type { BTF_FIELD_SPIN_LOCK, BTF_FIELD_TIMER, BTF_FIELD_KPTR, @@ -3203,18 +3203,22 @@ enum { }; struct btf_field_info { - u32 type_id; + enum btf_field_type type; u32 off; - enum bpf_kptr_type type; + struct { + u32 type_id; + } kptr; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, enum btf_field_type field_type, + struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; + info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } @@ -3222,9 +3226,12 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { - enum bpf_kptr_type type; + enum btf_field_type type; u32 res_id; + /* Permit modifiers on the pointer itself */ + if (btf_type_is_volatile(t)) + t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; @@ -3248,28 +3255,66 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!__btf_type_is_struct(t)) return -EINVAL; - info->type_id = res_id; - info->off = off; info->type = type; + info->off = off; + info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } -static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_type field_type, +static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, + int *align, int *sz) +{ + int type = 0; + + if (field_mask & BPF_SPIN_LOCK) { + if (!strcmp(name, "bpf_spin_lock")) { + if (*seen_mask & BPF_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_SPIN_LOCK; + type = BPF_SPIN_LOCK; + goto end; + } + } + if (field_mask & BPF_TIMER) { + if (!strcmp(name, "bpf_timer")) { + if (*seen_mask & BPF_TIMER) + return -E2BIG; + *seen_mask |= BPF_TIMER; + type = BPF_TIMER; + goto end; + } + } + /* Only return BPF_KPTR when all other types with matchable names fail */ + if (field_mask & BPF_KPTR) { + type = BPF_KPTR_REF; + goto end; + } + return 0; +end: + *sz = btf_field_type_size(type); + *align = btf_field_type_align(type); + return type; +} + +static int btf_find_struct_field(const struct btf *btf, + const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_member *member; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; off = __btf_member_bit_offset(t, member); if (off % 8) @@ -3277,17 +3322,18 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t return -EINVAL; off /= 8; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, member_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3307,37 +3353,41 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_var_secinfo *vsi; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - off = vsi->offset; - - if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; + + off = vsi->offset; if (vsi->size != sz) continue; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, var_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3357,169 +3407,203 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { - const char *name; - int sz, align; - - switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - name = "bpf_spin_lock"; - sz = sizeof(struct bpf_spin_lock); - align = __alignof__(struct bpf_spin_lock); - break; - case BTF_FIELD_TIMER: - name = "bpf_timer"; - sz = sizeof(struct bpf_timer); - align = __alignof__(struct bpf_timer); - break; - case BTF_FIELD_KPTR: - name = NULL; - sz = sizeof(u64); - align = 8; - break; - default: - return -EFAULT; - } - if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_struct_field(btf, t, field_mask, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); return -EINVAL; } -/* find 'struct bpf_spin_lock' in map value. - * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; + struct module *mod = NULL; + const struct btf_type *t; + struct btf *kernel_btf; int ret; + s32 id; - ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; -} + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info->kptr.type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) + return id; + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info->type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } -int btf_find_timer(const struct btf *btf, const struct btf_type *t) -{ - struct btf_field_info info; - int ret; + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } - ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + field->kptr.dtor = (void *)addr; + } + + field->kptr.btf_id = id; + field->kptr.btf = kernel_btf; + field->kptr.module = mod; + return 0; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); + return ret; } -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size) { - struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; - struct bpf_map_value_off *tab; - struct btf *kernel_btf = NULL; - struct module *mod = NULL; - int ret, i, nr_off; + struct btf_field_info info_arr[BTF_FIELDS_MAX]; + struct btf_record *rec; + int ret, i, cnt; - ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) return NULL; - nr_off = ret; - tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); - if (!tab) + cnt = ret; + rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + if (!rec) return ERR_PTR(-ENOMEM); - for (i = 0; i < nr_off; i++) { - const struct btf_type *t; - s32 id; + rec->spin_lock_off = -EINVAL; + rec->timer_off = -EINVAL; + for (i = 0; i < cnt; i++) { + if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { + WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); + ret = -EFAULT; + goto end; + } - /* Find type in map BTF, and use it to look up the matching type - * in vmlinux or module BTFs, by name and kind. - */ - t = btf_type_by_id(btf, info_arr[i].type_id); - id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); - if (id < 0) { - ret = id; + rec->field_mask |= info_arr[i].type; + rec->fields[i].offset = info_arr[i].off; + rec->fields[i].type = info_arr[i].type; + + switch (info_arr[i].type) { + case BPF_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->spin_lock_off = rec->fields[i].offset; + break; + case BPF_TIMER: + WARN_ON_ONCE(rec->timer_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->timer_off = rec->fields[i].offset; + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + default: + ret = -EFAULT; goto end; } + rec->cnt++; + } + return rec; +end: + btf_record_free(rec); + return ERR_PTR(ret); +} - /* Find and stash the function pointer for the destruction function that - * needs to be eventually invoked from the map free path. - */ - if (info_arr[i].type == BPF_KPTR_REF) { - const struct btf_type *dtor_func; - const char *dtor_func_name; - unsigned long addr; - s32 dtor_btf_id; - - /* This call also serves as a whitelist of allowed objects that - * can be used as a referenced pointer and be stored in a map at - * the same time. - */ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); - if (dtor_btf_id < 0) { - ret = dtor_btf_id; - goto end_btf; - } +static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); - if (!dtor_func) { - ret = -ENOENT; - goto end_btf; - } + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); - if (!mod) { - ret = -ENXIO; - goto end_btf; - } - } +static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv) +{ + struct btf_field_offs *foffs = (void *)priv; + u32 *off_base = foffs->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; - /* We already verified dtor_func to be btf_type_is_func - * in register_btf_id_dtor_kfuncs. - */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); - addr = kallsyms_lookup_name(dtor_func_name); - if (!addr) { - ret = -EINVAL; - goto end_mod; - } - tab->off[i].kptr.dtor = (void *)addr; - } + sz_a = foffs->field_sz + (a - off_base); + sz_b = foffs->field_sz + (b - off_base); - tab->off[i].offset = info_arr[i].off; - tab->off[i].type = info_arr[i].type; - tab->off[i].kptr.btf_id = id; - tab->off[i].kptr.btf = kernel_btf; - tab->off[i].kptr.module = mod; - } - tab->nr_off = nr_off; - return tab; -end_mod: - module_put(mod); -end_btf: - btf_put(kernel_btf); -end: - while (i--) { - btf_put(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) +{ + struct btf_field_offs *foffs; + u32 i, *off; + u8 *sz; + + BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); + if (IS_ERR_OR_NULL(rec) || WARN_ON_ONCE(rec->cnt > sizeof(foffs->field_off))) + return NULL; + + foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); + if (!foffs) + return ERR_PTR(-ENOMEM); + + off = foffs->field_off; + sz = foffs->field_sz; + for (i = 0; i < rec->cnt; i++) { + off[i] = rec->fields[i].offset; + sz[i] = btf_field_type_size(rec->fields[i].type); } - kfree(tab); - return ERR_PTR(ret); + foffs->cnt = rec->cnt; + + if (foffs->cnt == 1) + return foffs; + sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]), + btf_field_offs_cmp, btf_field_offs_swap, foffs); + return foffs; } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, @@ -6367,7 +6451,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* kptr_get is only true for kfunc */ if (i == 0 && kptr_get) { - struct bpf_map_value_off_desc *off_desc; + struct btf_field *kptr_field; if (reg->type != PTR_TO_MAP_VALUE) { bpf_log(log, "arg#0 expected pointer to map value\n"); @@ -6383,8 +6467,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); - if (!off_desc || off_desc->type != BPF_KPTR_REF) { + kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); + if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", reg->off + reg->var_off.value); return -EINVAL; @@ -6403,8 +6487,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, func_name, i, btf_type_str(ref_t), ref_tname); return -EINVAL; } - if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, - off_desc->kptr.btf_id, true)) { + if (!btf_struct_ids_match(log, btf, ref_id, 0, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, true)) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname); return -EINVAL; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index bb03fdba73bb..6b6a78c04b90 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -4,13 +4,16 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ -/* The 'cpumap' is primarily used as a backend map for XDP BPF helper +/** + * DOC: cpu map + * The 'cpumap' is primarily used as a backend map for XDP BPF helper * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. * - * Unlike devmap which redirects XDP frames out another NIC device, + * Unlike devmap which redirects XDP frames out to another NIC device, * this map type redirects raw XDP frames to another CPU. The remote * CPU will do SKB-allocation and call the normal network stack. - * + */ +/* * This is a scalability and isolation mechanism, that allow * separating the early driver network XDP layer, from the rest of the * netstack, and assigning dedicated CPUs for this stage. This diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f39ee3e05589..50d254cd0709 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -222,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_timer(&htab->map)) + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -231,28 +231,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } -static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_kptrs(&htab->map)) + if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); - for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -764,10 +761,7 @@ static void check_and_free_fields(struct bpf_htab *htab, { void *map_value = elem->key + round_up(htab->map.key_size, 8); - if (map_value_has_timer(&htab->map)) - bpf_timer_cancel_and_free(map_value + htab->map.timer_off); - if (map_value_has_kptrs(&htab->map)) - bpf_map_free_kptrs(&htab->map, map_value); + bpf_obj_free_fields(htab->map.record, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -1091,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { - if (unlikely(!map_value_has_spin_lock(map))) + if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, @@ -1474,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We don't reset or free kptr on uref dropping to zero, - * hence just free timer. - */ - bpf_timer_cancel_and_free(l->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + /* We only free timer on uref dropping to zero */ + bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } @@ -1490,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(&htab->map)) + /* We only free timer on uref dropping to zero */ + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1517,11 +1507,11 @@ static void htab_map_free(struct bpf_map *map) if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { - htab_free_prealloced_kptrs(htab); + htab_free_prealloced_fields(htab); prealloc_destroy(htab); } - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); @@ -1675,7 +1665,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 124fd199ce5c..283f55bbeb70 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -366,9 +366,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, struct bpf_spin_lock *lock; if (lock_src) - lock = src + map->spin_lock_off; + lock = src + map->record->spin_lock_off; else - lock = dst + map->spin_lock_off; + lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); @@ -1169,7 +1169,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->timer_off; + t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 098cf336fae6..e90d9f63edc5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 135205d0d560..8ca0cca39d49 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -29,7 +29,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-ENOTSUPP); } - if (map_value_has_spin_lock(inner_map)) { + if (btf_record_has_field(inner_map->record, BPF_SPIN_LOCK)) { fdput(f); return ERR_PTR(-ENOTSUPP); } @@ -50,9 +50,15 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; - inner_map_meta->spin_lock_off = inner_map->spin_lock_off; - inner_map_meta->timer_off = inner_map->timer_off; - inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); + inner_map_meta->record = btf_record_dup(inner_map->record); + if (IS_ERR(inner_map_meta->record)) { + /* btf_record_dup returns NULL or valid pointer in case of + * invalid/empty/valid, but ERR_PTR in case of errors. During + * equality NULL or IS_ERR is equivalent. + */ + fdput(f); + return ERR_CAST(inner_map_meta->record); + } if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -72,7 +78,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { - bpf_map_free_kptr_off_tab(map_meta); + bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -84,9 +90,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && - bpf_map_equal_kptr_off_tab(meta0, meta1); + btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5887592eeb93..85532d301124 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -495,114 +495,152 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -static int bpf_map_kptr_off_cmp(const void *a, const void *b) +static int btf_field_cmp(const void *a, const void *b) { - const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + const struct btf_field *f1 = a, *f2 = b; - if (off_desc1->offset < off_desc2->offset) + if (f1->offset < f2->offset) return -1; - else if (off_desc1->offset > off_desc2->offset) + else if (f1->offset > f2->offset) return 1; return 0; } -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, + enum btf_field_type type) { - /* Since members are iterated in btf_find_field in increasing order, - * offsets appended to kptr_off_tab are in increasing order, so we can - * do bsearch to find exact match. - */ - struct bpf_map_value_off *tab; + struct btf_field *field; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + return NULL; + field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); + if (!field || !(field->type & type)) return NULL; - tab = map->kptr_off_tab; - return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); + return field; } -void bpf_map_free_kptr_off_tab(struct bpf_map *map) +void btf_record_free(struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab; int i; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < tab->nr_off; i++) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + for (i = 0; i < rec->cnt; i++) { + switch (rec->fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (rec->fields[i].kptr.module) + module_put(rec->fields[i].kptr.module); + btf_put(rec->fields[i].kptr.btf); + break; + default: + WARN_ON_ONCE(1); + continue; + } } - kfree(tab); - map->kptr_off_tab = NULL; + kfree(rec); } -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +void bpf_map_free_record(struct bpf_map *map) { - struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; - int size, i; + btf_record_free(map->record); + map->record = NULL; +} - if (!map_value_has_kptrs(map)) - return ERR_PTR(-ENOENT); - size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); - new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); - if (!new_tab) +struct btf_record *btf_record_dup(const struct btf_record *rec) +{ + const struct btf_field *fields; + struct btf_record *new_rec; + int ret, size, i; + + if (IS_ERR_OR_NULL(rec)) + return NULL; + size = offsetof(struct btf_record, fields[rec->cnt]); + new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_rec) return ERR_PTR(-ENOMEM); - /* Do a deep copy of the kptr_off_tab */ - for (i = 0; i < tab->nr_off; i++) { - btf_get(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { - while (i--) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + /* Do a deep copy of the btf_record */ + fields = rec->fields; + new_rec->cnt = 0; + for (i = 0; i < rec->cnt; i++) { + switch (fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + btf_get(fields[i].kptr.btf); + if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { + ret = -ENXIO; + goto free; } - kfree(new_tab); - return ERR_PTR(-ENXIO); + break; + default: + ret = -EFAULT; + WARN_ON_ONCE(1); + goto free; } + new_rec->cnt++; } - return new_tab; + return new_rec; +free: + btf_record_free(new_rec); + return ERR_PTR(ret); } -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { - struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; - bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; - if (!a_has_kptr && !b_has_kptr) + if (!a_has_fields && !b_has_fields) return true; - if (a_has_kptr != b_has_kptr) + if (a_has_fields != b_has_fields) return false; - if (tab_a->nr_off != tab_b->nr_off) + if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); - return !memcmp(tab_a, tab_b, size); + size = offsetof(struct btf_record, fields[rec_a->cnt]); + return !memcmp(rec_a, rec_b, size); } -/* Caller must ensure map_value_has_kptrs is true. Note that this function can - * be called on a map value while the map_value is visible to BPF programs, as - * it ensures the correct synchronization, and we already enforce the same using - * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. - */ -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +void bpf_obj_free_timer(const struct btf_record *rec, void *obj) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - unsigned long *btf_id_ptr; - int i; + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) + return; + bpf_timer_cancel_and_free(obj + rec->timer_off); +} - for (i = 0; i < tab->nr_off; i++) { - struct bpf_map_value_off_desc *off_desc = &tab->off[i]; - unsigned long old_ptr; +void bpf_obj_free_fields(const struct btf_record *rec, void *obj) +{ + const struct btf_field *fields; + int i; - btf_id_ptr = map_value + off_desc->offset; - if (off_desc->type == BPF_KPTR_UNREF) { - u64 *p = (u64 *)btf_id_ptr; + if (IS_ERR_OR_NULL(rec)) + return; + fields = rec->fields; + for (i = 0; i < rec->cnt; i++) { + const struct btf_field *field = &fields[i]; + void *field_ptr = obj + field->offset; - WRITE_ONCE(*p, 0); + switch (fields[i].type) { + case BPF_SPIN_LOCK: + break; + case BPF_TIMER: + bpf_timer_cancel_and_free(field_ptr); + break; + case BPF_KPTR_UNREF: + WRITE_ONCE(*(u64 *)field_ptr, 0); + break; + case BPF_KPTR_REF: + field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + break; + default: + WARN_ON_ONCE(1); continue; } - old_ptr = xchg(btf_id_ptr, 0); - off_desc->kptr.dtor((void *)old_ptr); } } @@ -612,10 +650,10 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); - kfree(map->off_arr); + kfree(map->field_offs); bpf_map_release_memcg(map); /* implementation dependent freeing, map_free callback also does - * bpf_map_free_kptr_off_tab, if needed. + * bpf_map_free_record, if needed. */ map->ops->map_free(map); } @@ -778,8 +816,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || map_value_has_kptrs(map)) + if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -906,84 +943,6 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) -{ - const u32 a = *(const u32 *)_a; - const u32 b = *(const u32 *)_b; - - if (a < b) - return -1; - else if (a > b) - return 1; - return 0; -} - -static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) -{ - struct bpf_map *map = (struct bpf_map *)priv; - u32 *off_base = map->off_arr->field_off; - u32 *a = _a, *b = _b; - u8 *sz_a, *sz_b; - - sz_a = map->off_arr->field_sz + (a - off_base); - sz_b = map->off_arr->field_sz + (b - off_base); - - swap(*a, *b); - swap(*sz_a, *sz_b); -} - -static int bpf_map_alloc_off_arr(struct bpf_map *map) -{ - bool has_spin_lock = map_value_has_spin_lock(map); - bool has_timer = map_value_has_timer(map); - bool has_kptrs = map_value_has_kptrs(map); - struct bpf_map_off_arr *off_arr; - u32 i; - - if (!has_spin_lock && !has_timer && !has_kptrs) { - map->off_arr = NULL; - return 0; - } - - off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); - if (!off_arr) - return -ENOMEM; - map->off_arr = off_arr; - - off_arr->cnt = 0; - if (has_spin_lock) { - i = off_arr->cnt; - - off_arr->field_off[i] = map->spin_lock_off; - off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); - off_arr->cnt++; - } - if (has_timer) { - i = off_arr->cnt; - - off_arr->field_off[i] = map->timer_off; - off_arr->field_sz[i] = sizeof(struct bpf_timer); - off_arr->cnt++; - } - if (has_kptrs) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - u32 *off = &off_arr->field_off[off_arr->cnt]; - u8 *sz = &off_arr->field_sz[off_arr->cnt]; - - for (i = 0; i < tab->nr_off; i++) { - *off++ = tab->off[i].offset; - *sz++ = sizeof(u64); - } - off_arr->cnt += tab->nr_off; - } - - if (off_arr->cnt == 1) - return 0; - sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), - map_off_arr_cmp, map_off_arr_swap, map); - return 0; -} - static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -1006,40 +965,11 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->spin_lock_off = btf_find_spin_lock(btf, value_type); - - if (map_value_has_spin_lock(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE && - map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) - return -ENOTSUPP; - if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > - map->value_size) { - WARN_ONCE(1, - "verifier bug spin_lock_off %d value_size %d\n", - map->spin_lock_off, map->value_size); - return -EFAULT; - } - } - - map->timer_off = btf_find_timer(btf, value_type); - if (map_value_has_timer(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) - return -EOPNOTSUPP; - } + map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR, + map->value_size); + if (!IS_ERR_OR_NULL(map->record)) { + int i; - map->kptr_off_tab = btf_parse_kptrs(btf, value_type); - if (map_value_has_kptrs(map)) { if (!bpf_capable()) { ret = -EPERM; goto free_map_tab; @@ -1048,12 +978,45 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, ret = -EACCES; goto free_map_tab; } - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { - ret = -EOPNOTSUPP; - goto free_map_tab; + for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { + switch (map->record->field_mask & (1 << i)) { + case 0: + continue; + case BPF_SPIN_LOCK: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_TIMER: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + return -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + default: + /* Fail if map_type checks are missing for a field type */ + ret = -EOPNOTSUPP; + goto free_map_tab; + } } } @@ -1065,7 +1028,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; free_map_tab: - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); return ret; } @@ -1074,6 +1037,7 @@ free_map_tab: static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct btf_field_offs *foffs; struct bpf_map *map; int f_flags; int err; @@ -1118,8 +1082,6 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); - map->spin_lock_off = -EINVAL; - map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1155,13 +1117,17 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = bpf_map_alloc_off_arr(map); - if (err) + + foffs = btf_parse_field_offs(map->record); + if (IS_ERR(foffs)) { + err = PTR_ERR(foffs); goto free_map; + } + map->field_offs = foffs; err = security_bpf_map_alloc(map); if (err) - goto free_map_off_arr; + goto free_map_field_offs; err = bpf_map_alloc_id(map); if (err) @@ -1185,8 +1151,8 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); -free_map_off_arr: - kfree(map->off_arr); +free_map_field_offs: + kfree(map->field_offs); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -1333,7 +1299,7 @@ static int map_lookup_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1406,7 +1372,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1569,7 +1535,7 @@ int generic_map_delete_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1626,7 +1592,7 @@ int generic_map_update_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1689,7 +1655,7 @@ int generic_map_lookup_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); @@ -1811,7 +1777,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1882,8 +1848,7 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || map_value_has_kptrs(map)) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e97917b97b3f..07c0259dfc1a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -262,7 +262,7 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; - struct bpf_map_value_off_desc *kptr_off_desc; + struct btf_field *kptr_field; u8 uninit_dynptr_regno; }; @@ -454,14 +454,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && - map_value_has_spin_lock(reg->map_ptr); -} - -static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) -{ - type = base_type(type); - return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || - type == PTR_TO_MEM || type == PTR_TO_BTF_ID; + btf_record_has_field(reg->map_ptr->record, BPF_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -511,6 +504,15 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_for_each_map_elem || + func_id == BPF_FUNC_timer_set_callback || + func_id == BPF_FUNC_find_vma || + func_id == BPF_FUNC_loop || + func_id == BPF_FUNC_user_ringbuf_drain; +} + static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -875,7 +877,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (reg->id) verbose_a("id=%d", reg->id); - if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) + if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose_a("off=%d", reg->off); @@ -1400,7 +1402,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (map_value_has_timer(map->inner_map_meta)) + if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; @@ -1689,7 +1691,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; + reg->precise = !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -2658,6 +2660,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (opcode == BPF_CALL) { if (insn->src_reg == BPF_PSEUDO_CALL) return -ENOTSUPP; + /* BPF helpers that invoke callback subprogs are + * equivalent to BPF_PSEUDO_CALL above + */ + if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) + return -ENOTSUPP; /* regular helper call sets R0 */ *reg_mask &= ~1; if (*reg_mask & 0x3f) { @@ -2747,8 +2754,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, /* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. + * We also skip current state and go straight to first parent state, + * because precision markings in current non-checkpointed state are + * not needed. See why in the comment in __mark_chain_precision below. */ - for (; st; st = st->parent) + for (st = st->parent; st; st = st->parent) { for (i = 0; i <= st->curframe; i++) { func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { @@ -2766,9 +2776,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, reg->precise = true; } } + } } -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, +static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + } +} + +/* + * __mark_chain_precision() backtracks BPF program instruction sequence and + * chain of verifier states making sure that register *regno* (if regno >= 0) + * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked + * SCALARS, as well as any other registers and slots that contribute to + * a tracked state of given registers/stack slots, depending on specific BPF + * assembly instructions (see backtrack_insns() for exact instruction handling + * logic). This backtracking relies on recorded jmp_history and is able to + * traverse entire chain of parent states. This process ends only when all the + * necessary registers/slots and their transitive dependencies are marked as + * precise. + * + * One important and subtle aspect is that precise marks *do not matter* in + * the currently verified state (current state). It is important to understand + * why this is the case. + * + * First, note that current state is the state that is not yet "checkpointed", + * i.e., it is not yet put into env->explored_states, and it has no children + * states as well. It's ephemeral, and can end up either a) being discarded if + * compatible explored state is found at some point or BPF_EXIT instruction is + * reached or b) checkpointed and put into env->explored_states, branching out + * into one or more children states. + * + * In the former case, precise markings in current state are completely + * ignored by state comparison code (see regsafe() for details). Only + * checkpointed ("old") state precise markings are important, and if old + * state's register/slot is precise, regsafe() assumes current state's + * register/slot as precise and checks value ranges exactly and precisely. If + * states turn out to be compatible, current state's necessary precise + * markings and any required parent states' precise markings are enforced + * after the fact with propagate_precision() logic, after the fact. But it's + * important to realize that in this case, even after marking current state + * registers/slots as precise, we immediately discard current state. So what + * actually matters is any of the precise markings propagated into current + * state's parent states, which are always checkpointed (due to b) case above). + * As such, for scenario a) it doesn't matter if current state has precise + * markings set or not. + * + * Now, for the scenario b), checkpointing and forking into child(ren) + * state(s). Note that before current state gets to checkpointing step, any + * processed instruction always assumes precise SCALAR register/slot + * knowledge: if precise value or range is useful to prune jump branch, BPF + * verifier takes this opportunity enthusiastically. Similarly, when + * register's value is used to calculate offset or memory address, exact + * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to + * what we mentioned above about state comparison ignoring precise markings + * during state comparison, BPF verifier ignores and also assumes precise + * markings *at will* during instruction verification process. But as verifier + * assumes precision, it also propagates any precision dependencies across + * parent states, which are not yet finalized, so can be further restricted + * based on new knowledge gained from restrictions enforced by their children + * states. This is so that once those parent states are finalized, i.e., when + * they have no more active children state, state comparison logic in + * is_state_visited() would enforce strict and precise SCALAR ranges, if + * required for correctness. + * + * To build a bit more intuition, note also that once a state is checkpointed, + * the path we took to get to that state is not important. This is crucial + * property for state pruning. When state is checkpointed and finalized at + * some instruction index, it can be correctly and safely used to "short + * circuit" any *compatible* state that reaches exactly the same instruction + * index. I.e., if we jumped to that instruction from a completely different + * code path than original finalized state was derived from, it doesn't + * matter, current state can be discarded because from that instruction + * forward having a compatible state will ensure we will safely reach the + * exit. States describe preconditions for further exploration, but completely + * forget the history of how we got here. + * + * This also means that even if we needed precise SCALAR range to get to + * finalized state, but from that point forward *that same* SCALAR register is + * never used in a precise context (i.e., it's precise value is not needed for + * correctness), it's correct and safe to mark such register as "imprecise" + * (i.e., precise marking set to false). This is what we rely on when we do + * not set precise marking in current state. If no child state requires + * precision for any given SCALAR register, it's safe to dictate that it can + * be imprecise. If any child state does require this register to be precise, + * we'll mark it precise later retroactively during precise markings + * propagation from child state to parent states. + * + * Skipping precise marking setting in current state is a mild version of + * relying on the above observation. But we can utilize this property even + * more aggressively by proactively forgetting any precise marking in the + * current state (which we inherited from the parent state), right before we + * checkpoint it and branch off into new child state. This is done by + * mark_all_scalars_imprecise() to hopefully get more permissive and generic + * finalized states which help in short circuiting more future states. + */ +static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, int spi) { struct bpf_verifier_state *st = env->cur_state; @@ -2785,18 +2908,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, if (!env->bpf_capable) return 0; - func = st->frame[st->curframe]; + /* Do sanity checks against current state of register and/or stack + * slot, but don't set precise flag in current state, as precision + * tracking in the current state is unnecessary. + */ + func = st->frame[frame]; if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { WARN_ONCE(1, "backtracing misuse"); return -EFAULT; } - if (!reg->precise) - new_marks = true; - else - reg_mask = 0; - reg->precise = true; + new_marks = true; } while (spi >= 0) { @@ -2809,11 +2932,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, stack_mask = 0; break; } - if (!reg->precise) - new_marks = true; - else - stack_mask = 0; - reg->precise = true; + new_marks = true; break; } @@ -2821,12 +2940,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; if (!reg_mask && !stack_mask) return 0; + for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + + if (last_idx < 0) { + /* we are at the entry into subprog, which + * is expected for global funcs, but only if + * requested precise registers are R1-R5 + * (which are global func's input arguments) + */ + if (st->curframe == 0 && + st->frame[0]->subprogno > 0 && + st->frame[0]->callsite == BPF_MAIN_FUNC && + stack_mask == 0 && (reg_mask & ~0x3e) == 0) { + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &st->frame[0]->regs[i]; + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); + continue; + } + reg->precise = true; + } + return 0; + } + + verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", + st->frame[0]->subprogno, reg_mask, stack_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + for (i = last_idx;;) { if (skip_first) { err = 0; @@ -2866,7 +3015,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, break; new_marks = false; - func = st->frame[st->curframe]; + func = st->frame[frame]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; @@ -2932,12 +3081,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno, -1); + return __mark_chain_precision(env, env->cur_state->curframe, regno, -1); +} + +static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno) +{ + return __mark_chain_precision(env, frame, regno, -1); } -static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi) { - return __mark_chain_precision(env, -1, spi); + return __mark_chain_precision(env, frame, -1, spi); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -3186,14 +3340,17 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; mark_stack_slot_scratched(env, spi); - if (!env->allow_ptr_leaks - && *stype != NOT_INIT - && *stype != SCALAR_VALUE) { - /* Reject the write if there's are spilled pointers in - * range. If we didn't reject here, the ptr status - * would be erased below (even though not all slots are - * actually overwritten), possibly opening the door to - * leaks. + if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) { + /* Reject the write if range we may write to has not + * been initialized beforehand. If we didn't reject + * here, the ptr status would be erased below (even + * though not all slots are actually overwritten), + * possibly opening the door to leaks. + * + * We do however catch STACK_INVALID case below, and + * only allow reading possibly uninitialized memory + * later for CAP_PERFMON, as the write may not happen to + * that slot. */ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", insn_idx, i); @@ -3683,15 +3840,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, } static int map_kptr_match_type(struct bpf_verifier_env *env, - struct bpf_map_value_off_desc *off_desc, + struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); int perm_flags = PTR_MAYBE_NULL; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) perm_flags |= PTR_UNTRUSTED; if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -3738,15 +3895,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * strict mode to true for type match. */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - off_desc->kptr.btf, off_desc->kptr.btf_id, - off_desc->type == BPF_KPTR_REF)) + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + kptr_field->type == BPF_KPTR_REF)) goto bad_type; return 0; bad_type: verbose(env, "invalid kptr access, R%d type=%s%s ", regno, reg_type_str(env, reg->type), reg_name); verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), targ_name); else @@ -3756,7 +3913,7 @@ bad_type: static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, - struct bpf_map_value_off_desc *off_desc) + struct btf_field *kptr_field) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); @@ -3766,7 +3923,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * - Reject cases where variable offset may touch kptr * - size of access (must be BPF_DW) * - tnum_is_const(reg->var_off) - * - off_desc->offset == off + reg->var_off.value + * - kptr_field->offset == off + reg->var_off.value */ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ if (BPF_MODE(insn->code) != BPF_MEM) { @@ -3777,7 +3934,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -3787,19 +3944,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, - off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && - map_kptr_match_type(env, off_desc, val_reg, value_regno)) + map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { if (insn->imm) { verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", - off_desc->offset); + kptr_field->offset); return -EACCES; } } else { @@ -3818,45 +3975,30 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; - int err; + struct btf_record *rec; + int err, i; err = check_mem_region_access(env, regno, off, size, map->value_size, zero_size_allowed); if (err) return err; - if (map_value_has_spin_lock(map)) { - u32 lock = map->spin_lock_off; + if (IS_ERR_OR_NULL(map->record)) + return 0; + rec = map->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 p = field->offset; - /* if any part of struct bpf_spin_lock can be touched by - * load/store reject this program. - * To check that [x1, x2) overlaps with [y1, y2) + /* If any part of a field can be touched by load/store, reject + * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ - if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && - lock < reg->umax_value + off + size) { - verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_timer(map)) { - u32 t = map->timer_off; - - if (reg->smin_value + off < t + sizeof(struct bpf_timer) && - t < reg->umax_value + off + size) { - verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_kptrs(map)) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - int i; - - for (i = 0; i < tab->nr_off; i++) { - u32 p = tab->off[i].offset; - - if (reg->smin_value + off < p + sizeof(u64) && - p < reg->umax_value + off + size) { + if (reg->smin_value + off < p + btf_field_type_size(field->type) && + p < reg->umax_value + off + size) { + switch (field->type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -3875,10 +4017,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } break; + default: + verbose(env, "%s cannot be accessed directly by load/store\n", + btf_field_type_name(field->type)); + return -EACCES; } } } - return err; + return 0; } #define MAX_PACKET_OFF 0xffff @@ -4751,7 +4897,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { - struct bpf_map_value_off_desc *kptr_off_desc = NULL; + struct btf_field *kptr_field = NULL; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -4765,11 +4911,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; if (tnum_is_const(reg->var_off)) - kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, - off + reg->var_off.value); - if (kptr_off_desc) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, - kptr_off_desc); + kptr_field = btf_record_find(reg->map_ptr->record, + off + reg->var_off.value, BPF_KPTR); + if (kptr_field) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; @@ -5160,10 +5305,6 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) - goto mark; - - if (is_spilled_reg(&state->stack[spi]) && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { if (clobber) { @@ -5193,6 +5334,11 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + * be sure that whether stack slot is written to or not. Hence, + * we must still conservatively propagate reads upwards even if + * helper may write to the entire memory range. + */ } return update_stack_depth(env, state, min_off); } @@ -5442,24 +5588,13 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_spin_lock(map)) { - if (map->spin_lock_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_spin_lock'\n", - map->name); - else if (map->spin_lock_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_spin_lock'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_spin_lock is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + verbose(env, "map '%s' has no valid bpf_spin_lock\n", map->name); return -EINVAL; } - if (map->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", - val + reg->off); + if (map->record->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", + val + reg->off, map->record->spin_lock_off); return -EINVAL; } if (is_lock) { @@ -5502,24 +5637,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_timer(map)) { - if (map->timer_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_timer'\n", - map->name); - else if (map->timer_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_timer'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_timer is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_TIMER)) { + verbose(env, "map '%s' has no valid bpf_timer\n", map->name); return -EINVAL; } - if (map->timer_off != val + reg->off) { + if (map->record->timer_off != val + reg->off) { verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->timer_off); + val + reg->off, map->record->timer_off); return -EINVAL; } if (meta->map_ptr) { @@ -5535,10 +5659,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map_value_off_desc *off_desc; struct bpf_map *map_ptr = reg->map_ptr; + struct btf_field *kptr_field; u32 kptr_off; - int ret; if (!tnum_is_const(reg->var_off)) { verbose(env, @@ -5551,30 +5674,23 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, map_ptr->name); return -EINVAL; } - if (!map_value_has_kptrs(map_ptr)) { - ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); - if (ret == -E2BIG) - verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, - BPF_MAP_VALUE_OFF_MAX); - else if (ret == -EEXIST) - verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); - else - verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); return -EINVAL; } meta->map_ptr = map_ptr; kptr_off = reg->off + reg->var_off.value; - off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); - if (!off_desc) { + kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); + if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (off_desc->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } - meta->kptr_off_desc = off_desc; + meta->kptr_field = kptr_field; return 0; } @@ -5796,7 +5912,7 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { @@ -6651,6 +6767,10 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx); + static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) @@ -6701,6 +6821,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } + /* set_callee_state is used for direct subprog calls, but we are + * interested in validating only BPF helpers that can call subprogs as + * callbacks + */ + if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { + verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + if (insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { @@ -7484,7 +7614,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_type) && - map_value_has_spin_lock(meta.map_ptr)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -7548,8 +7678,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; if (func_id == BPF_FUNC_kptr_xchg) { - ret_btf = meta.kptr_off_desc->kptr.btf; - ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + ret_btf = meta.kptr_field->kptr.btf; + ret_btf_id = meta.kptr_field->kptr.btf_id; } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -9207,6 +9337,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); + } else if (dst_reg->precise) { + /* if dst_reg is precise, src_reg should be precise as well */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; } } else { /* Pretend the src is a reg with a known value, since we only @@ -10395,7 +10530,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - if (map_value_has_spin_lock(map)) + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -11520,7 +11655,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (env->explore_alu_limits) return false; if (rcur->type == SCALAR_VALUE) { - if (!rold->precise && !rcur->precise) + if (!rold->precise) return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && @@ -11843,34 +11978,36 @@ static int propagate_precision(struct bpf_verifier_env *env, { struct bpf_reg_state *state_reg; struct bpf_func_state *state; - int i, err = 0; + int i, err = 0, fr; - state = old->frame[old->curframe]; - state_reg = state->regs; - for (i = 0; i < BPF_REG_FP; i++, state_reg++) { - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating r%d\n", i); - err = mark_chain_precision(env, i); - if (err < 0) - return err; - } + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating r%d\n", i, fr); + err = mark_chain_precision_frame(env, fr, i); + if (err < 0) + return err; + } - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (!is_spilled_reg(&state->stack[i])) - continue; - state_reg = &state->stack[i].spilled_ptr; - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating fp%d\n", - (-i - 1) * BPF_REG_SIZE); - err = mark_chain_precision_stack(env, i); - if (err < 0) - return err; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (!is_spilled_reg(&state->stack[i])) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE, fr); + err = mark_chain_precision_stack_frame(env, fr, i); + if (err < 0) + return err; + } } return 0; } @@ -12065,6 +12202,10 @@ next: env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; + /* forget precise markings we inherited, see __mark_chain_precision */ + if (env->bpf_capable) + mark_all_scalars_imprecise(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); @@ -12673,7 +12814,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - if (map_value_has_spin_lock(map)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -12690,7 +12831,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (map_value_has_timer(map)) { + if (btf_record_has_field(map->record, BPF_TIMER)) { if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_timer yet\n"); return -EINVAL; @@ -14613,6 +14754,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, subprog); + state->first_insn_idx = env->subprog_info[subprog].start; + state->last_insn_idx = -1; regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { |