// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global variables will be modified before loading
//       from userspace (the perf tool) using the skeleton helpers.

// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to the cgroup's index (used to index cgrp_readings)
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from userspace
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

int enabled = 0;
int use_cgroup_v2 = 0;

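// A minimal sketch of the userspace side (it lives in the perf tool, not in
// this BPF object): before loading, the generated skeleton is used to resize
// the maps above and fill in the rodata.  The skeleton name bperf_cgroup_bpf
// below is an assumption derived from this file's name; the real loader code
// may differ.
//
//	struct bperf_cgroup_bpf *skel = bperf_cgroup_bpf__open();
//
//	bpf_map__set_max_entries(skel->maps.events, nr_cpus * nr_events);
//	bpf_map__set_max_entries(skel->maps.cgrp_idx, nr_cgroups);
//	bpf_map__set_max_entries(skel->maps.prev_readings, nr_events);
//	bpf_map__set_max_entries(skel->maps.cgrp_readings,
//				 nr_cgroups * nr_events);
//
//	skel->rodata->num_events = nr_events;
//	skel->rodata->num_cpus = nr_cpus;
//
//	bperf_cgroup_bpf__load(skel);
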
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

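	// walk the current task's perf_event (v1) cgroup hierarchy and record
	// the map index of every ancestor cgroup that is being monitored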
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_event_cgrp_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

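	// same walk for cgroup v2: query each ancestor level of the current
	// task's cgroup using the ancestor cgroup-id helper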
	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static int bperf_cgroup_count(void)
{
	register __u32 idx = 0;  // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

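	// collect the map indices of all monitored cgroups the current task
	// belongs to, including ancestors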
	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
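		// the events map is event-major: the value for event idx on
		// this cpu sits at slot idx * num_cpus + cpu (e.g. with
		// num_cpus == 8, event 2 on cpu 3 is slot 19)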
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
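				// cgrp_readings is cgroup-major:
				// slot = cgroup index * num_events + event index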
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		*prev_val = val;
	}
	return 0;
}

// This will be attached to cgroup-switches event for each cpu
SEC("perf_events")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

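// Presumably invoked to refresh the readings outside of cgroup switches; the
// section name suggests it is attached to the sched_switch raw tracepoint,
// but when and how it actually runs is decided by the perf tool.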
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";