// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

#include <trace/events/qdisc.h>
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty iff q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing the packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
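
/* As an illustration of the enqueue/dequeue contract described above,
 * here is a minimal sketch of a FIFO discipline built from the generic
 * helpers in sch_generic.h. It is illustration only and not part of this
 * file: the name "example_fifo" is hypothetical, and bounding the queue
 * by sch->limit is an assumption borrowed from sch_fifo.c.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	/* Accept the packet while the backlog is below the limit... */
	if (likely(sch->q.qlen < sch->limit))
		return qdisc_enqueue_tail(skb, sch);

	/* ...otherwise drop it; NET_XMIT_DROP propagates to the caller. */
	return qdisc_drop(skb, sch, to_free);
}

static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
{
	/* May return NULL; the queue is empty iff sch->q.qlen == 0. */
	return qdisc_dequeue_head(sch);
}
#endif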

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
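
/* Typical usage from a scheduler module: register the ops on module init
 * and unregister them on exit. A minimal sketch, assuming a hypothetical
 * "example_fifo_qdisc_ops" built around handlers like the ones sketched
 * near the top of this file.
 */
#if 0	/* illustrative sketch, not compiled */
static int __init example_fifo_module_init(void)
{
	return register_qdisc(&example_fifo_qdisc_ops);
}

static void __exit example_fifo_module_exit(void)
{
	unregister_qdisc(&example_fifo_qdisc_ops);
}

module_init(example_fifo_module_init);
module_exit(example_fifo_module_exit);
MODULE_LICENSE("GPL");
#endif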

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* The handle is known. Find the qdisc among all qdiscs attached to the
 * device (the root qdisc, all its children, children of children, etc.)
 * Note: the caller holds either the rtnl lock or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	return cops->leaf(p, cl);
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
/* Older iproute2 versions did not transfer the linklayer setting, and the
 * kernel has dropped its rate-table lookup system. To stay backward
 * compatible with older iproute2 tc utilities, we detect the linklayer
 * setting by checking whether the rate table was modified.
 *
 * For linklayer ATM, table entries are aligned to 48-byte cells, so some
 * table entries will contain the same value. The mpu (min packet unit) is
 * also encoded into the old rate table, so starting from the mpu we find
 * the low and high table entries for mapping this cell. If these entries
 * contain the same value, then the rate table has been modified for
 * linklayer ATM.
 *
 * This is done by rounding mpu up to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, computing the table entry one below,
 * and comparing.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
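
/* Worked example of the detection above (assumed numbers, for
 * illustration only): with mpu = 96 and cell_log = 4, low = roundup(96, 48)
 * = 96 and high = roundup(97, 48) = 144, so cell_low = 96 >> 4 = 6 and
 * cell_high = (144 >> 4) - 1 = 8. If rtab[6] == rtab[8], two distinct
 * packet sizes map to the same transmission time, which happens when the
 * table was built for 48-byte ATM cells.
 */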

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 &&
		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
			continue;
		stab->refcnt++;
		return stab;
	}

	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree_rcu(tab, rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
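
/* Worked example (assumed stab parameters, for illustration only): with
 * overhead = 24, cell_align = 0, cell_log = 6 and size_log = 6, a
 * 1000-byte skb gives pkt_len = 1024 and slot = 1024 >> 6 = 16, so the
 * accounted length becomes stab->data[16] << 6 bytes (provided
 * tsize > 16).
 */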

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		/* If timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 */
		if (wd->last_expires - expires <= delta_ns)
			return;
	}

	wd->last_expires = expires;
	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
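
/* Typical caller pattern: a rate-limiting qdisc whose head packet is not
 * yet due arms the watchdog instead of returning the packet. A minimal
 * sketch, assuming a hypothetical private struct with a qdisc_watchdog
 * member and a precomputed "time_to_send" deadline.
 */
#if 0	/* illustrative sketch, not compiled */
static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_get_ns();

	if (!sch->q.qlen)
		return NULL;

	if (q->time_to_send > now) {
		/* Fire qdisc_watchdog() at the deadline; the timer
		 * reschedules the root qdisc, which calls dequeue again.
		 */
		qdisc_watchdog_schedule_ns(&q->watchdog, q->time_to_send);
		return NULL;
	}

	return qdisc_dequeue_head(sch);
}
#endif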

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
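
/* Typical classful-qdisc usage of the hash above: initialize it in
 * ->init(), insert each class keyed by classid, and let
 * qdisc_class_hash_grow() rehash as the class count rises. A minimal
 * sketch, assuming a hypothetical class type embedding Qdisc_class_common
 * (compare e.g. sch_htb.c); locking is omitted for brevity.
 */
#if 0	/* illustrative sketch, not compiled */
struct example_class {
	struct Qdisc_class_common common;	/* classid + hash node */
	/* ...scheduler-specific state... */
};

static int example_add_class(struct Qdisc *sch,
			     struct Qdisc_class_hash *clhash, u32 classid)
{
	struct example_class *cl = kzalloc(sizeof(*cl), GFP_KERNEL);

	if (!cl)
		return -ENOMEM;

	cl->common.classid = classid;
	qdisc_class_hash_insert(clhash, &cl->common);
	qdisc_class_hash_grow(sch, clhash);
	return 0;
}
#endif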

/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}
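
/* Handle layout refresher: the upper 16 bits of a handle are the major
 * number and the lower 16 bits the minor, so TC_H_MAKE(0x10000U, 0)
 * advances autohandle by exactly one major. The first handle handed out
 * above is therefore 8001:0000.
 */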

void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
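
/* Typical caller pattern: a qdisc that discards packets outside of its
 * enqueue path (for example while shrinking its limit in ->change())
 * must propagate the qlen/backlog delta to its ancestors. A minimal
 * sketch under those assumptions:
 */
#if 0	/* illustrative sketch, not compiled */
static void example_trim_queue(struct Qdisc *sch, u32 new_limit)
{
	unsigned int dropped_pkts = 0, dropped_bytes = 0;

	while (sch->q.qlen > new_limit) {
		struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);

		/* Fix up this qdisc's own counters... */
		qdisc_qstats_backlog_dec(sch, skb);
		dropped_pkts++;
		dropped_bytes += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
	}

	/* ...then tell every ancestor this subtree just lost packets. */
	qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
}
#endif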

int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_put(old);
}

static void qdisc_clear_nolock(struct Qdisc *sch)
{
	sch->flags &= ~TCQ_F_NOLOCK;
	if (!(sch->flags & TCQ_F_CPUSTATS))
		return;

	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	sch->cpu_bstats = NULL;
	sch->cpu_qstats = NULL;
	sch->flags &= ~TCQ_F_CPUSTATS;
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replayed request will call
				 * qdisc_lookup_ops again, so don't keep
				 * a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to stay backward compatible with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      true,
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to
				 *   say that the qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the
	 * global qdisc hashtable, we don't want to hit it again.
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
1792  
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

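/* The delete notification is filled in *before* ->delete() runs: once
 * the class is gone its configuration can no longer be dumped. If the
 * delete itself fails, the prepared skb is simply discarded.
 */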
static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl,
			     struct netlink_ext_ack *extack)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl, extack);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	return err;
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;
	unsigned long cl;
	u32 classid;
};
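
/* When a class comes or goes, filters whose result points at its classid
 * must be re-bound. The walkers below visit every filter on every chain
 * of the qdisc and call ->bind_class() to re-point matching results;
 * binding to class 0 (on delete) unbinds them.
 */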
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}

struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
};

static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL);
		     tp; tp = tcf_get_next_proto(chain, tp)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tc_bind_class_args args = {};

	if (!cops->tcf_block)
		return;
	args.portid = portid;
	args.clid = clid;
	args.new_cl = new_cl;
	args.w.fn = tc_bind_class_walker;
	q->ops->cl_ops->walk(q, &args.w);
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified handle.
	   handle == X:0	 - root class.
	 */
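	/* For illustration (iproute2 syntax, handles parsed as hex):
	 *
	 *   tc class add dev eth0 parent 1:1 classid 1:10 ...
	 *
	 * arrives with tcm_parent == 0x00010001 and tcm_handle ==
	 * 0x00010010. TC_H_MAJ() extracts the high 16 bits (the owning
	 * qdisc) and TC_H_MAKE() recombines major and minor below.
	 */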

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with both
		 * parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the class from its filters by re-binding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

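/* The class walker callback only receives a struct qdisc_walker pointer;
 * embedding it as the first member lets qdisc_class_dump() cast the
 * pointer back to struct qdisc_dump_args to recover the skb and the
 * netlink callback state.
 */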
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
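/* /proc/net/psched exports four hex words that userspace (e.g. iproute2's
 * tc) reads to calibrate its time units: nanoseconds per pretended
 * microsecond, nanoseconds per psched tick, a fixed 1 MHz "clock
 * resolution" value kept for compatibility, and the hrtimer frequency
 * in ticks per second.
 */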
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

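/* Boot-time wiring: register the per-netns /proc/net/psched file, the
 * always-built-in qdiscs, and the rtnetlink handlers: one doit callback
 * per RTM_*QDISC / RTM_*TCLASS command, plus dumpit callbacks for the
 * two GET requests.
 */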
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);