From e1cc6cfcb99ecd0d41297166c09ffa9d8b7660de Mon Sep 17 00:00:00 2001
From: mengensun
Date: Fri, 23 Feb 2024 17:55:22 +0800
Subject: [PATCH 01/13] net/mbuf: add mbuf for netns

Add mbuf interfaces for netns; latencies larger than the threshold
will be printed to the mbuf.

Reviewed-by: kernelxing
Reviewed-by: yuehongwu
Signed-off-by: mengensun
---
 fs/proc/proc_net.c         |  79 ++++++
 kernel/cgroup/mbuf.c       | 123 ++++++++-
 net/Kconfig                |   9 +
 net/core/Makefile          |   1 +
 net/core/netns_mbuf.c      | 346 ++++++++++++++++++++++++
 net/ipv4/Kconfig           |  12 +
 net/ipv4/Makefile          |   1 +
 net/ipv4/af_inet.c         |  12 +
 net/ipv4/netlat.c          | 525 +++++++++++++++++++++++++++++++++++++
 net/ipv4/sysctl_net_ipv4.c |  25 ++
 net/ipv4/tcp.c             |  15 +-
 net/ipv4/tcp_input.c       |  23 ++
 net/ipv4/tcp_output.c      |  50 +++-
 13 files changed, 1216 insertions(+), 5 deletions(-)
 create mode 100644 net/core/netns_mbuf.c
 create mode 100644 net/ipv4/netlat.c

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index be90c3cf67d0..f89b93f62fd1 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -65,6 +65,61 @@ static int seq_open_net(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_RQM
+/* Taken from seq_open_net; everything is the same except that the
+ * private data is allocated with vmalloc. Why?
+ *
+ * Someone may need a big private area; rather than wasting
+ * contiguous physical memory, they can use this function and get a
+ * vmalloc'ed private instead.
+ *
+ * From now on, if you use this open ABI, please provide a write
+ * fops like proc_simple_write: the pde->write check is deleted.
+ */
+void *seq_open_net_large_private(struct inode *inode, struct file *file)
+{
+	struct net *net;
+	struct seq_file *seq;
+	struct seq_net_private *p;
+	int ret;
+	unsigned int state_size = PDE(inode)->state_size;
+
+	WARN_ON_ONCE(state_size < sizeof(struct seq_net_private));
+
+	net = get_proc_net(inode);
+	if (!net) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	p = vmalloc(state_size);
+	if (!p) {
+		ret = -ENOMEM;
+		goto put_out;
+	}
+	memset(p, 0, state_size);
+
+	ret = seq_open(file, PDE(inode)->seq_ops);
+	if (ret < 0)
+		goto free_out;
+
+	seq = file->private_data;
+	seq->private = (void *)p;
+
+#ifdef CONFIG_NET_NS
+	p->net = net;
+#endif
+	return p;
+
+free_out:
+	vfree(p);
+put_out:
+	put_net(net);
+out:
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(seq_open_net_large_private);
+#endif
+
 static int seq_release_net(struct inode *ino, struct file *f)
 {
 	struct seq_file *seq = f->private_data;
@@ -118,6 +173,30 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(proc_create_net_data);
 
+#ifdef CONFIG_RQM
+/* Add an extended ABI that lets callers define the fops themselves.
+ * This is just like proc_create_net_data except for the extra
+ * f_ops parameter.
+ */
+struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode,
+		struct proc_dir_entry *parent, const struct seq_operations *seq_ops,
+		unsigned int state_size, void *data,
+		const struct file_operations *f_ops)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create_reg(name, mode, &parent, data);
+	if (!p)
+		return NULL;
+
+	pde_force_lookup(p);
+	p->proc_fops = f_ops;
+	p->seq_ops = seq_ops;
+	p->state_size = state_size;
+	return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_data_ops);
+#endif
+
 /**
  * proc_create_net_data_write - Create a writable net_ns-specific proc file
  * @name: The name of the file.
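For illustration, a minimal (hypothetical) consumer of the two interfaces added
above could look as follows. The example_* names are placeholders, not part of
this patch; the sketch only assumes the pairing rule stated in the comments:
open through seq_open_net_large_private() and release the vmalloc'ed private
with vfree.

#include <linux/err.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>

/* large per-open state: the reason vmalloc is preferred here */
struct example_state {
	struct seq_net_private snp;	/* must be the first member */
	char scratch[1 << 20];
};

static void *example_start(struct seq_file *s, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;
}

static void *example_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;
}

static void example_stop(struct seq_file *s, void *v)
{
}

static int example_show(struct seq_file *s, void *v)
{
	seq_puts(s, "example\n");
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start	= example_start,
	.next	= example_next,
	.stop	= example_stop,
	.show	= example_show,
};

static int example_open(struct inode *inode, struct file *file)
{
	void *p = seq_open_net_large_private(inode, file);

	return IS_ERR(p) ? PTR_ERR(p) : 0;
}

/* the vmalloc'ed private must be released with vfree */
static int example_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;

	put_net(seq_file_net(seq));
	vfree(seq->private);
	seq->private = NULL;
	return seq_release(inode, file);
}

static const struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= example_release,
};

/* registration, e.g. from a pernet ->init hook:
 *
 *	proc_create_net_data_ops("example", 0444, net->proc_net,
 *				 &example_seq_ops,
 *				 sizeof(struct example_state),
 *				 NULL, &example_fops);
 */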
diff --git a/kernel/cgroup/mbuf.c b/kernel/cgroup/mbuf.c
index 6d7cb88273aa..d52206b0c053 100644
--- a/kernel/cgroup/mbuf.c
+++ b/kernel/cgroup/mbuf.c
@@ -21,14 +21,20 @@
 /* Define max mbuf len is 8M, and min is 2M */
 #define MBUF_LEN_MAX	(1 << 23)
 #define MBUF_LEN_MIN	(1 << 21)
-#define MBUF_LEN_DEF	MBUF_LEN_MIN
+/*
+ * From now on every netns has an mbuf. Changing the mbuf slot size
+ * is dangerous, so instead double the total buffer size to double
+ * the total mbuf slot num (see MBUF_SLOTS_DEF).
+ */
+#define MBUF_LEN_DEF	(1 << 22)
 
 #define MBUF_MSG_LEN_MAX	1024
 
 /* Monitor buffer support max 1024 items */
 #define MBUF_SLOTS_MAX	1024
 #define MBUF_SLOTS_MIN	256
-#define MBUF_SLOTS_DEF	512
+#define MBUF_SLOTS_DEF	1024
 
 /* Global mbuf metadata struct */
 static struct mbuf_struct g_mbuf = {
@@ -417,3 +423,116 @@ void mbuf_free(struct cgroup *cg)
 	spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags);
 }
 
+static u32 rd_mbuf_next(struct mbuf_ring *mring, u32 curr_idx)
+{
+	struct mbuf_ring_desc *cdesc, *ndesc;
+	u32 frees, next_idx;
+	void *start;
+
+	start = (void *)(mring + 1);
+	cdesc = (struct mbuf_ring_desc *)(start + curr_idx);
+	next_idx = curr_idx + cdesc->len;
+
+	frees = mring->end_idx - next_idx;
+	if (frees < sizeof(struct mbuf_ring_desc)) {
+		/* end */
+		if (next_idx == mring->next_idx)
+			return next_idx;
+
+		/* buffer wrapped to head */
+		next_idx = mring->base_idx;
+		goto next;
+	}
+
+	ndesc = (struct mbuf_ring_desc *)(start + next_idx);
+
+	/* a zero-length descriptor means the tail is unused: wrap to head */
+	if (!ndesc->len && next_idx != mring->next_idx)
+		next_idx = mring->base_idx;
+next:
+	return next_idx;
+}
+
+static ssize_t rd_mbuf_read(struct mbuf_slot *mb, struct mbuf_user_desc *udesc)
+{
+	struct mbuf_ring_desc *desc;
+	ssize_t ret;
+	size_t i, len, tbuf_len;
+	char *start;
+
+	tbuf_len = sizeof(udesc->buf);
+	start = (char *)(mb->mring + 1);
+	desc = (struct mbuf_ring_desc *)(start + udesc->user_idx);
+
+	len = sprintf(udesc->buf, "%llu:", desc->ts_ns);
+	start = (char *)(desc + 1);
+
+	for (i = 0; i < desc->text_len; i++) {
+		unsigned char c = start[i];
+
+		if (c < ' ' || c >= 127 || c == '\\')
+			continue;
+		else
+			udesc->buf[len++] = c;
+		if (len >= tbuf_len)
+			break;
+	}
+
+	len = len >= tbuf_len ? tbuf_len - 1 : len;
+	udesc->buf[len] = '\n';
+	udesc->user_seq++;
+	ret = len;
+	return ret;
+}
+
+/* These ops are the read-side ABI of the mbuf. The mbuf has a write
+ * ops which is protected by a spinlock, but there is no read-write
+ * protection, so use them like this:
+ *
+ * call snapshot_mbuf() to copy the data from the mbuf to `dst`,
+ * then read the dst with the following ops.
+ *
+ * All indexes are offsets from the end of the snapshot's mbuf_ring
+ * header, not from the global mbuf memory pool.
+ *
+ * btw: the private data of a seq file is the ideal place to hold
+ * the snapshot.
+ */
+const struct mbuf_operations rd_mbuf_ops = {
+	.read	= rd_mbuf_read,
+	.next	= rd_mbuf_next,
+};
+
+void snapshot_mbuf(struct mbuf_slot *dst, struct mbuf_slot *src, seqlock_t *lock)
+{
+	unsigned int seq;
+
+	do {
+		/* The peer of this lock is the write side. We want
+		 * the writer to go first on conflict, so this reader
+		 * retries until it gets a consistent snapshot of the
+		 * buffer.
+		 */
+		cond_resched();
+		seq = read_seqbegin(lock);
+		memcpy((void *)dst, (void *)src, g_mbuf.mbuf_size_per_cg);
+	} while (read_seqretry(lock, seq));

+	/* All the ops in `rd_mbuf_ops` expect indexes that are
+	 * offsets from the end of the snapshot's mbuf_ring header,
+	 * so adjust the indexes as a whole here.
+	 */
+	dst->mring = (struct mbuf_ring *)(dst + 1);
+	dst->mring->end_idx = dst->mring->end_idx - dst->mring->base_idx;
+	dst->mring->first_idx = dst->mring->first_idx - dst->mring->base_idx;
+	dst->mring->next_idx = dst->mring->next_idx - dst->mring->base_idx;
+	dst->mring->base_idx = 0;
+	dst->ops = &rd_mbuf_ops;
+}
+EXPORT_SYMBOL(snapshot_mbuf);
+
+/* the per-cg mbuf size never changes once the system has booted */
+u32 get_mbuf_slot_len(void)
+{
+	return g_mbuf.mbuf_size_per_cg;
+}
+EXPORT_SYMBOL(get_mbuf_slot_len);
diff --git a/net/Kconfig b/net/Kconfig
index 1d4211802ebd..d45927d9593b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -99,6 +99,15 @@ source "net/netlabel/Kconfig"
 
 endif # if INET
 
+config NETNS_MBUF
+	bool "attach a mbuf to net namespace"
+	depends on RQM && INET && PROC_FS
+	default y
+	---help---
+	  This allows attaching an mbuf to each net namespace.
+
+	  If you are unsure how to answer this question, answer N.
+
 config NETWORK_SECMARK
 	bool "Security Marking"
 	help
diff --git a/net/core/Makefile b/net/core/Makefile
index 518cdc0878c9..73c483e23013 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_NET_DEVLINK) += devlink.o
 obj-$(CONFIG_GRO_CELLS) += gro_cells.o
 obj-$(CONFIG_FAILOVER) += failover.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
+obj-$(CONFIG_NETNS_MBUF) += netns_mbuf.o
diff --git a/net/core/netns_mbuf.c b/net/core/netns_mbuf.c
new file mode 100644
index 000000000000..08bc7eb6135d
--- /dev/null
+++ b/net/core/netns_mbuf.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Make the mbuf usable from a net namespace.
+ *
+ * Author: mengensun
+ * Author: yuehongwu
+ * Copyright (C) 2024 Tencent, Inc
+ */
+
+#include
+#include
+
+#include
+#include
+
+struct net_mbuf_data {
+	struct proc_dir_entry *twatcher;
+	struct proc_dir_entry *log;
+	/* used between the write side and the read side: the read
+	 * side can be delayed, the write side can't be delayed
+	 */
+	seqlock_t lock;
+	/* This is ugly, but I have no better way to deal with the
+	 * kABI, because the mbuf kABI takes a cgroup parameter.
+	 * !!This is not really a cgroup, just a struct cgroup that
+	 * keeps the kABI and mbuf happy.
+	 *
+	 * When the next kABI version is planned, we can:
+	 * 1. add a struct mbuf_slot pointer to struct net
+	 * 2. change the mbuf kABI to take the mbuf_slot as the
+	 *    parameter instead of the cgroup
+	 * 3. delete the code using net_generic and just use the
+	 *    pointer in struct net directly
+	 */
+	unsigned char cgroup[0];
+};
+
+struct mbuf_seq_data {
+	struct seq_net_private snp;
+	struct mbuf_user_desc udesc;
+	struct mbuf_slot snapshot[0];
+};
+
+unsigned int net_mbuf_id __read_mostly;
+static struct mbuf_slot *get_net_mbuf(struct net *net)
+{
+	struct net_mbuf_data *pdata;
+
+	pdata = net_generic(net, net_mbuf_id);
+	return ((struct cgroup *)pdata->cgroup)->mbuf;
+}
+
+/* Not controlled by sysctl_qos_mbuf_enable because a later patch
+ * adds /proc/sys/net/ipv4/netlat/enable.
+ */
+ssize_t net_mbuf_print(struct net *net, const char *fmt, ...)
+{
+	struct mbuf_slot *mb;
+	struct cgroup *cg;
+	va_list args;
+	struct net_mbuf_data *pdata;
+
+	pdata = net_generic(net, net_mbuf_id);
+
+	cg = (struct cgroup *)(pdata->cgroup);
+	mb = cg->mbuf;
+
+	if (!mb)
+		goto out;
+
+	if (!__ratelimit(&mb->ratelimit))
+		goto out;
+
+	if (mb->ops) {
+		va_start(args, fmt);
+		write_seqlock(&pdata->lock);
+		mb->ops->write(cg, fmt, args);
+		write_sequnlock(&pdata->lock);
+		va_end(args);
+	}
+out:
+	return 0;
+}
+EXPORT_SYMBOL(net_mbuf_print);
+
+/* The udesc is the user-side interface used to get data from the
+ * mbuf. We can allocate one udesc per user rather than allocating
+ * a udesc and binding it to the mbuf while a user is accessing it.
+ *
+ * The seq file private data is the ideal place to hold the udesc:
+ * with the udesc there, everything is simple.
+ */
+static void *netns_mbuf_start(struct seq_file *s, loff_t *pos)
+{
+	u32 index;
+	struct mbuf_user_desc *udesc;
+	struct mbuf_seq_data *pd;
+
+	pd = s->private;
+	udesc = &pd->udesc;
+	index = *pos;
+
+	/* why: see seq_mbuf_open */
+	if (!pd->snapshot->mring)
+		return NULL;
+
+	/* If we already reached the end, just return */
+	if (index && index == pd->snapshot->mring->next_idx)
+		return NULL;
+
+	udesc->user_idx = pd->snapshot->mring->first_idx;
+	udesc->user_seq = pd->snapshot->mring->first_seq;
+
+	/* Maybe we reached the end, or the buffer is empty */
+	if (udesc->user_idx == pd->snapshot->mring->next_idx)
+		return NULL;
+	return udesc;
+}
+
+static void *netns_mbuf_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct mbuf_seq_data *pd;
+	struct mbuf_user_desc *udesc = v;
+
+	pd = s->private;
+
+	/* why: see seq_mbuf_open */
+	if (!pd->snapshot->mring)
+		return NULL;
+
+	udesc->user_idx = pd->snapshot->ops->next(pd->snapshot->mring,
+						  udesc->user_idx);
+	*pos = udesc->user_idx;
+	if (udesc->user_idx == pd->snapshot->mring->next_idx)
+		return NULL;
+
+	return udesc;
+}
+
+static void netns_mbuf_stop(struct seq_file *s, void *v) { }
+
+static int netns_mbuf_show(struct seq_file *s, void *v)
+{
+	ssize_t ret;
+	struct mbuf_seq_data *pd;
+	struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v;
+
+	pd = s->private;
+
+	/* why: see seq_mbuf_open */
+	if (!pd->snapshot->mring)
+		return 0;
+
+	memset(udesc->buf, 0, sizeof(udesc->buf));
+	ret = pd->snapshot->ops->read(pd->snapshot, udesc);
+	if (ret > 0)
+		seq_printf(s, "%s", udesc->buf);
+	return 0;
+}
+
+extern void *seq_open_net_large_private(struct inode *, struct file *);
+extern void snapshot_mbuf(struct mbuf_slot *dst, struct mbuf_slot *src, seqlock_t *mbuf_lock);
+static int seq_mbuf_open(struct inode *inode, struct file *file)
+{
+	struct net_mbuf_data *pd;
+	struct mbuf_seq_data *p;
+	struct mbuf_slot *mbuf;
+
+	p = seq_open_net_large_private(inode, file);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	mbuf = get_net_mbuf(p->snp.net);
+	/* The netns may have no mbuf attached because the mbuf pool
+	 * has a max num. We still let the open succeed, so the
+	 * seq_ops must check the mring pointer.
+	 *
+	 * btw: the private data was zeroed in
+	 * seq_open_net_large_private.
+	 */
+	if (!mbuf)
+		return 0;
+
+	pd = net_generic(p->snp.net, net_mbuf_id);
+	snapshot_mbuf(p->snapshot, mbuf, &pd->lock);
+	return 0;
+}
+
+/* This function is taken from seq_release_net; everything is the
+ * same except that it uses **vfree** to free the private data.
+ */
+static int seq_mbuf_release(struct inode *ino, struct file *f)
+{
+	struct seq_file *seq = f->private_data;
+
+	put_net(seq_file_net(seq));
+	vfree(seq->private);
+	seq->private = NULL;
+	seq_release(ino, f);
+	return 0;
+}
+
+/* a write clears the data */
+static ssize_t seq_mbuf_write(struct file *f, const char __user *ubuf,
+			      size_t size, loff_t *_pos)
+{
+	struct seq_file *seq = f->private_data;
+	struct mbuf_seq_data *p;
+	struct net_mbuf_data *pd;
+	struct mbuf_slot *mb;
+
+	p = seq->private;
+	pd = net_generic(p->snp.net, net_mbuf_id);
+	mb = get_net_mbuf(p->snp.net);
+
+	/* the netns has no mbuf attached */
+	if (!mb)
+		return size;
+
+	/* reset the mbuf to clear all the data */
+	write_seqlock(&pd->lock);
+	mb->mring->first_idx = mb->mring->base_idx;
+	mb->mring->first_seq = 0;
+	mb->mring->next_idx = mb->mring->base_idx;
+	mb->mring->next_seq = 0;
+	write_sequnlock(&pd->lock);
+	return size;
+}
+
+/* seq_read holds a mutex while calling these functions, but that
+ * mutex is bound to the struct file, not to the inode: it only
+ * serializes mbuf access among tasks sharing the same file object
+ * (e.g. multiple threads of one process).
+ *
+ * If multiple processes access the mbuf, there is no mutual
+ * exclusion between them.
+ */
+static const struct seq_operations mbuf_seq_ops = {
+	.show	= netns_mbuf_show,
+	.start	= netns_mbuf_start,
+	.next	= netns_mbuf_next,
+	.stop	= netns_mbuf_stop,
+};
+
+static const struct file_operations mbuf_seq_fops = {
+	.open	 = seq_mbuf_open,
+	.read	 = seq_read,
+	.write	 = seq_mbuf_write,
+	.llseek	 = seq_lseek,
+	.release = seq_mbuf_release,
+};
+
+extern struct proc_dir_entry *proc_create_net_data_ops(const char *name,
+		umode_t mode, struct proc_dir_entry *parent,
+		const struct seq_operations *seq_ops,
+		unsigned int state_size, void *data,
+		const struct file_operations *fops);
+extern u32 get_mbuf_slot_len(void);
+static int __net_init net_mbuf_init(struct net *net)
+{
+	int ret;
+	struct net_mbuf_data *p;
+	struct mbuf_slot *mbuf;
+
+	ret = 0;
+	p = net_generic(net, net_mbuf_id);
+
+	/* If the mbuf allocation fails, still let the netns creation
+	 * succeed: returning an error here would put a limit on the
+	 * max number of netns that can be created on the system.
+	 *
+	 * btw: mbuf_slot has a max num of 1024 for now; once all the
+	 * slots are used, further allocations may fail. What we can
+	 * do is keep the user interface unchanged and let netlat
+	 * `speak nothing`.
+	 * The cgroup is only used for the kABI.
+	 */
+	seqlock_init(&p->lock);
+	mbuf = mbuf_slot_alloc((struct cgroup *)(p->cgroup));
+	if (!mbuf)
+		pr_err("failed to alloc mbuf\n");
+	((struct cgroup *)p->cgroup)->mbuf = mbuf;
+
+	p->twatcher = proc_net_mkdir(net, "twatcher", net->proc_net);
+	if (!p->twatcher) {
+		ret = -ENOMEM;
+		goto free_mbuf;
+	}
+
+	p->log = proc_create_net_data_ops("log", S_IFREG | 0644, p->twatcher,
+			&mbuf_seq_ops,
+			sizeof(struct mbuf_seq_data) + get_mbuf_slot_len(),
+			NULL, &mbuf_seq_fops);
+	if (!p->log) {
+		ret = -ENOMEM;
+		goto remove_watcher;
+	}
+	return ret;
+
+remove_watcher:
+	remove_proc_entry("twatcher", net->proc_net);
+free_mbuf:
+	if (mbuf)
+		mbuf_free((struct cgroup *)(p->cgroup));
+	return ret;
+}
+
+static void __net_exit net_mbuf_exit(struct net *net)
+{
+	struct net_mbuf_data *pdata;
+
+	pdata = net_generic(net, net_mbuf_id);
+
+	remove_proc_entry("log", pdata->twatcher);
+	remove_proc_entry("twatcher", net->proc_net);
+
+	/* if the mbuf allocation failed, there is nothing to free */
+	if (!((struct cgroup *)pdata->cgroup)->mbuf)
+		return;
+	mbuf_free((struct cgroup *)(pdata->cgroup));
+}
+
+static struct pernet_operations net_mbuf_ops = {
+	.init = net_mbuf_init,
+	.exit = net_mbuf_exit,
+	.id   = &net_mbuf_id,
+	/* for the kABI */
+	.size = sizeof(struct net_mbuf_data) + sizeof(struct cgroup),
+};
+
+int inet_mbuf_init(void)
+{
+	return register_pernet_subsys(&net_mbuf_ops);
+}
+EXPORT_SYMBOL(inet_mbuf_init);
+
+void inet_mbuf_exit(void)
+{
+	unregister_pernet_subsys(&net_mbuf_ops);
+}
+EXPORT_SYMBOL(inet_mbuf_exit);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index f5af8c6b2f87..3d305a02fef8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -432,6 +432,18 @@ config INET_UDP_DIAG
 	  Support for UDP socket monitoring interface used by the ss tool.
 	  If unsure, say Y.
 
+config NETLAT
+	bool "INET: allow collecting netlat info"
+	depends on NETNS_MBUF
+	default y
+	---help---
+	  Enable some hooks in the net stack to collect latency info.
+	  If the latency is bigger than the threshold configured via
+	  the user interface, a message is printed to the mbuf.
+
+	  If unsure, say N.
+
+
 config INET_RAW_DIAG
 	tristate "RAW: socket monitoring interface"
 	depends on INET_DIAG && (IPV6 || IPV6=n)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9e1a186a3671..921a5c8186ef 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
 obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_NETLAT) += netlat.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 36405511052a..26d35bca5436 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -2075,6 +2075,10 @@ fs_initcall(inet_init);
 /* ------------------------------------------------------------------------ */
 
 #ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NETNS_MBUF
+extern int inet_mbuf_init(void);
+extern void inet_mbuf_exit(void);
+#endif
 static int __init ipv4_proc_init(void)
 {
 	int rc = 0;
@@ -2087,11 +2091,19 @@ static int __init ipv4_proc_init(void)
 		goto out_udp;
 	if (ping_proc_init())
 		goto out_ping;
+#ifdef CONFIG_NETNS_MBUF
+	if (inet_mbuf_init())
+		goto out_mbuf;
+#endif
 	if (ip_misc_proc_init())
 		goto out_misc;
 out:
 	return rc;
 out_misc:
+#ifdef CONFIG_NETNS_MBUF
+	inet_mbuf_exit();
+out_mbuf:
+#endif
 	ping_proc_exit();
 out_ping:
 	udp4_proc_exit();
diff --git a/net/ipv4/netlat.c b/net/ipv4/netlat.c
new file mode 100644
index 000000000000..e256f4e4fe81
--- /dev/null
+++ b/net/ipv4/netlat.c
@@ -0,0 +1,525 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Net latency monitor based on the quality monitor buffer.
+ * Aims to provide net latency data for a long-running system.
+ *
+ * Author: mengensun
+ * Author: yuehongwu
+ * Copyright (C) 2024 Tencent, Inc
+ */
+
+#include
+#include
+#include
+extern ssize_t net_mbuf_print(struct net *net, const char *fmt, ...);
+
+struct netlat_net_data {
+	int ack;
+	int pick;
+	int queue;
+	int enable;
+	unsigned long *ports;
+	struct ctl_table_header *netlat_hdr;
+};
+
+static unsigned int netlat_net_id __read_mostly;
+DEFINE_STATIC_KEY_FALSE(enable_netlat);
+
+static inline int get_ack_lat(struct net *net)
+{
+	struct netlat_net_data *pdata;
+
+	pdata = net_generic(net, netlat_net_id);
+	return pdata->ack;
+}
+
+static inline int get_pick_lat(struct net *net)
+{
+	struct netlat_net_data *pdata;
+
+	pdata = net_generic(net, netlat_net_id);
+	return pdata->pick;
+}
+
+static inline int get_queue_lat(struct net *net)
+{
+	struct netlat_net_data *pdata;
+
+	pdata = net_generic(net, netlat_net_id);
+	return pdata->queue;
+}
+
+static inline long *get_net_ports(struct net *net)
+{
+	struct netlat_net_data *pdata;
+
+	pdata = net_generic(net, netlat_net_id);
+	return pdata->ports;
+}
+
+/* The following functions can only be used with skbs on the rtx
+ * queue: an skb on the rtx queue is never passed down the stack
+ * again as-is, so ack_seq is unused for every skb on that queue.
+ * Adding a field to the skb would change the kABI. We need the
+ * delta from `skb enqueued on the rtx queue` to `skb dequeued from
+ * the rtx queue`, and all the existing timestamp fields are
+ * refreshed when an skb is retransmitted, so we can not use them;
+ * instead we borrow ack_seq to record the time the skb was
+ * enqueued on the rtx queue.
+ *
+ * !! In the next version that allows changing the kABI, please add
+ * a field to the skb and switch the following three functions to
+ * the new field. Borrowing ack_seq is such a hack!!
+ */
+static inline u32 get_rtxq_skb_jiffies(struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->ack_seq;
+}
+
+static inline void set_rtxq_skb_jiffies(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->ack_seq = tcp_jiffies32;
+}
+
+/* sk is unused for now but may be used in the future */
+void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst,
+			  struct sk_buff *src)
+{
+	if (!static_branch_unlikely(&enable_netlat))
+		return;
+	TCP_SKB_CB(dst)->ack_seq = TCP_SKB_CB(src)->ack_seq;
+}
+EXPORT_SYMBOL(netlat_copy_rtxq_skb);
+
+static inline u32 tcp_jiffies32_delt(struct sk_buff *skb)
+{
+	u32 j1, j2;
+
+	j1 = tcp_jiffies32;
+	j2 = get_rtxq_skb_jiffies(skb);
+
+	/* There is a small window here: ack_seq is initialized to 0
+	 * when the skb is allocated, so if we never stamped it, it
+	 * is still zero.
+	 */
+	if (!j2)
+		return 0;
+
+	if (likely(j1 >= j2))
+		return j1 - j2;
+	/* the u32 wrapped around */
+	return U32_MAX - (j2 - j1) + 1;
+}
+
+/* sk is unused for now but may be used in the future */
+void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb)
+{
+	if (!static_branch_unlikely(&enable_netlat))
+		return;
+	set_rtxq_skb_jiffies(skb);
+}
+EXPORT_SYMBOL(netlat_tcp_enrtxqueue);
+
+/* print a message to the per-net mbuf when an ack latency above
+ * the threshold is observed
+ */
+void netlat_ack_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct net *net;
+	s64 thresh;
+	s64 lat;
+	long *ports;
+
+	if (!static_branch_unlikely(&enable_netlat))
+		return;
+
+	net = sock_net(sk);
+
+	thresh = get_ack_lat(net);
+	if (!thresh)
+		return;
+
+	lat = tcp_jiffies32_delt(skb);
+	if (lat < thresh)
+		return;
+
+	ports = get_net_ports(net);
+	if (!test_bit(sk->sk_num, ports))
+		return;
+
+	net_mbuf_print(net, "TCP AC %u %pI4 %d %pI4 %d\n",
+		       (unsigned int)(jiffies_to_msecs(lat)),
+		       &sk->sk_rcv_saddr, (int)sk->sk_num,
+		       &sk->sk_daddr, (int)ntohs(sk->sk_dport));
+}
+EXPORT_SYMBOL(netlat_ack_check);
+
+/* netlat/enable is only visible in the root netns.
+ *
+ * The following three functions must be called with the `lock`
+ * defined below held. We follow these rules:
+ *
+ * 1. when `enable` is turned off: if we have enabled
+ *    net_timestamp, disable it
+ *
+ * 2. when `enable` is turned on: if `pick/queue` need
+ *    net_timestamp, enable it
+ *
+ * 3. when `pick/queue` are written and need net_timestamp while
+ *    `enable` is off: just record `I need net_timestamp` and do
+ *    nothing, leaving the rest to rule 2 above
+ *
+ * 4. when `pick/queue` are written and need net_timestamp while
+ *    `enable` is on: enable net_timestamp right away
+ */
+static struct mutex lock = __MUTEX_INITIALIZER(lock);
+static unsigned long need_time_stamp;
+
+/* for pick/queue writes: see the comment above */
+static void handle_net_timestamp(bool closed)
+{
+	/* !0 -> 0 */
+	if (closed) {
+		need_time_stamp--;
+		if (need_time_stamp == 0 &&
+		    static_branch_unlikely(&enable_netlat))
+			net_disable_timestamp();
+		return;
+	}
+
+	/* 0 -> !0 */
+	need_time_stamp++;
+	if (need_time_stamp == 1 &&
+	    static_branch_unlikely(&enable_netlat))
+		net_enable_timestamp();
+}
+
+/* for enable writes: see the comment above */
+static void handle_netlat_enable(bool closed)
+{
+	/* !0 -> 0 */
+	if (closed) {
+		if (need_time_stamp)
+			net_disable_timestamp();
+		static_branch_disable(&enable_netlat);
+		return;
+	}
+
+	/* 0 -> !0 */
+	if (need_time_stamp)
+		net_enable_timestamp();
+	static_branch_enable(&enable_netlat);
+}
+
+/* for netns exit: see the comment above */
+static void handle_net_timestamp_exit(bool queue, bool pick)
+{
+	need_time_stamp -= queue;
+	need_time_stamp -= pick;
+
+	if (!static_branch_unlikely(&enable_netlat))
+		return;
+	/*
+	 * if we decremented the counter to zero and netlat is
+	 * enabled, disable the timestamp
+	 */
+	if (!need_time_stamp && (queue || pick))
+		net_disable_timestamp();
+}
+
+static int proc_do_netlat_pick(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int prev;
+	int ret;
+	struct netlat_net_data *pdata;
+
+	mutex_lock(&lock);
+
+	pdata = container_of(table->data, struct netlat_net_data, pick);
+	prev = pdata->pick;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	/* only toggle the timestamp on a 0->!0 or !0->0 change */
+	if (!!prev == !!pdata->pick)
+		goto unlock;
+	handle_net_timestamp(!!prev);
+
+unlock:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static int proc_do_netlat_queue(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int prev;
+	int ret;
+	struct netlat_net_data *pdata;
+
+	mutex_lock(&lock);
+	pdata = container_of(table->data, struct netlat_net_data, queue);
+	prev = pdata->queue;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	/* only toggle the timestamp on a 0->!0 or !0->0 change */
+	if (!!prev == !!pdata->queue)
+		goto unlock;
+	handle_net_timestamp(!!prev);
+
+unlock:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static int proc_do_netlat_enable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int prev;
+	int ret;
+	struct netlat_net_data *pdata;
+
+	mutex_lock(&lock);
+
+	pdata = container_of(table->data, struct netlat_net_data, enable);
+	prev = pdata->enable;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (!!prev == !!pdata->enable)
+		goto unlock;
+	handle_netlat_enable(!!prev);
+
+unlock:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static struct ctl_table ipv4_netlat[] = {
+	{
+		.procname	= "lports",
+		.data		= NULL,
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
+	{
+		.procname	= "ack",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
+	},
+	{
+		.procname	= "queue",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_netlat_queue,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
+	},
+	{
+		.procname	= "pick",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
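+		/*
+		 * Like "queue" above, a write to "pick" may need to
+		 * flip net timestamping on or off, hence the
+		 * dedicated handler below rather than plain
+		 * proc_dointvec_minmax.
+		 */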
+		.proc_handler	= proc_do_netlat_pick,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
+	},
+	{
+		.procname	= "enable",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_netlat_enable,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{}
+};
+
+static int netlat_init_ipv4_ctl_table(struct net *net)
+{
+	int ret;
+	struct netlat_net_data *pdata;
+	struct ctl_table *table;
+
+	table = ipv4_netlat;
+	pdata = net_generic(net, netlat_net_id);
+
+	ret = 0;
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(table, sizeof(ipv4_netlat), GFP_KERNEL);
+		if (!table) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* do not export `enable` to child netns */
+		memset(&table[4], 0, sizeof(struct ctl_table));
+	}
+
+	pdata->ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!pdata->ports) {
+		ret = -ENOMEM;
+		goto free_table;
+	}
+
+	table[0].data = &pdata->ports;
+	table[1].data = &pdata->ack;
+	table[2].data = &pdata->queue;
+	table[3].data = &pdata->pick;
+
+	/* do not export `enable` to child netns */
+	if (net_eq(net, &init_net))
+		table[4].data = &pdata->enable;
+
+	pdata->netlat_hdr = register_net_sysctl(net, "net/ipv4/netlat", table);
+	if (!pdata->netlat_hdr) {
+		ret = -ENOMEM;
+		goto free_ports;
+	}
+	return ret;
+
+free_ports:
+	kfree(pdata->ports);
+free_table:
+	if (!net_eq(net, &init_net))
+		kfree(table);
+out:
+	return ret;
+}
+
+static void netlat_exit_ipv4_ctl_table(struct net *net)
+{
+	struct netlat_net_data *pdata;
+	struct ctl_table *table;
+
+	pdata = net_generic(net, netlat_net_id);
+
+	table = pdata->netlat_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(pdata->netlat_hdr);
+
+	/* the root netns never exits */
+	if (net_eq(net, &init_net))
+		return;
+
+	mutex_lock(&lock);
+	handle_net_timestamp_exit(!!pdata->queue, !!pdata->pick);
+	mutex_unlock(&lock);
+
+	kfree(table);
+	kfree(pdata->ports);
+}
+
+/* print a message to the per-net mbuf when the latency from the
+ * netif to being queued on the tcp receive queue exceeds the
+ * threshold
+ */
+void netlat_queue_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct net *net;
+	s64 lat;
+	int thresh;
+	long *ports;
+
+	if (!static_branch_unlikely(&enable_netlat))
+		return;
+
+	net = sock_net(sk);
+	if (!skb->tstamp)
+		return;
+
+	thresh = get_queue_lat(net);
+	if (!thresh)
+		return;
+
+	ports = get_net_ports(net);
+	if (!test_bit(sk->sk_num, ports))
+		return;
+
+	lat = ktime_to_ms(net_timedelta(skb->tstamp));
+	lat = lat < 0 ? 0 : lat;
+	if (lat < thresh)
+		return;
+
+	net_mbuf_print(net, "TCP QU %u %pI4 %d %pI4 %d\n",
+		       (unsigned int)lat,
+		       &sk->sk_rcv_saddr, (int)sk->sk_num,
+		       &sk->sk_daddr, (int)ntohs(sk->sk_dport));
+}
+EXPORT_SYMBOL(netlat_queue_check);
+
+/* print a message to the per-net mbuf when the latency from the
+ * netif to being picked up by the user app exceeds the threshold
+ */
+void netlat_pick_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct net *net;
+	s64 lat;
+	int thresh;
+	long *ports;
+
+	if (!static_branch_unlikely(&enable_netlat))
+		return;
+
+	net = sock_net(sk);
+	if (!skb->tstamp)
+		return;
+
+	thresh = get_pick_lat(net);
+	if (!thresh)
+		return;
+
+	ports = get_net_ports(net);
+	if (!test_bit(sk->sk_num, ports))
+		return;
+
+	lat = ktime_to_ms(net_timedelta(skb->tstamp));
+	lat = lat < 0 ? 0 : lat;
+	if (lat < thresh)
+		return;
+
+	net_mbuf_print(net, "TCP PI %u %pI4 %d %pI4 %d\n",
+		       (unsigned int)lat, &sk->sk_rcv_saddr, (int)sk->sk_num,
+		       &sk->sk_daddr, (int)ntohs(sk->sk_dport));
+}
+EXPORT_SYMBOL(netlat_pick_check);
+
+static struct pernet_operations netlat_net_ops = {
+	.init = netlat_init_ipv4_ctl_table,
+	.exit = netlat_exit_ipv4_ctl_table,
+	.id   = &netlat_net_id,
+	.size = sizeof(struct netlat_net_data),
+};
+
+/* add some config files in proc */
+int netlat_net_init(void)
+{
+	return register_pernet_subsys(&netlat_net_ops);
+}
+EXPORT_SYMBOL(netlat_net_init);
+
+void netlat_net_exit(void)
+{
+	unregister_pernet_subsys(&netlat_net_ops);
+}
+EXPORT_SYMBOL(netlat_net_exit);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d1253a5359f3..8732a1712bf8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1385,6 +1385,14 @@ static struct ctl_table ipv4_net_table[] = {
 	{ }
 };
 
+#ifdef CONFIG_NETLAT
+extern int netlat_net_init(void);
+/*
+ * not used for now, but someone may use it later; declared here
+ * anyway
+ */
+extern void netlat_net_exit(void);
+#endif
 static __net_init int ipv4_sysctl_init_net(struct net *net)
 {
 	struct ctl_table *table;
@@ -1449,6 +1457,23 @@ static __init int sysctl_ipv4_init(void)
 		return -ENOMEM;
 	}
 
+#ifdef CONFIG_NETLAT
+	/*
+	 * This must come after ipv4_sysctl_ops is registered because
+	 * we live on the "net/ipv4" sub-tree: setup_net calls the
+	 * init hooks one by one, while cleanup_net calls the exit
+	 * hooks in reverse order.
+	 */
+	if (netlat_net_init()) {
+		unregister_pernet_subsys(&ipv4_sysctl_ops);
+		unregister_net_sysctl_table(hdr);
+		return -ENOMEM;
+	}
+#endif
+	/*
+	 * btw: if you add more setup code here, do not forget the
+	 * !!netlat_net_exit!! function on the error path
+	 */
 	return 0;
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4063356a0c40..9b60cdaa8d93 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1953,6 +1953,9 @@ static int tcp_inq_hint(struct sock *sk)
 	return inq;
 }
 
+#ifdef CONFIG_NETLAT
+extern void netlat_pick_check(struct sock *sk, struct sk_buff *skb);
+#endif
 /*
  * This routine copies from a sock struct into the user buffer.
  *
@@ -2179,15 +2182,23 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
-		if (!(flags & MSG_PEEK))
+		if (!(flags & MSG_PEEK)) {
+#ifdef CONFIG_NETLAT
+			netlat_pick_check(sk, skb);
+#endif
 			sk_eat_skb(sk, skb);
+		}
 		continue;
 
 found_fin_ok:
 		/* Process the FIN. */
 		WRITE_ONCE(*seq, *seq + 1);
-		if (!(flags & MSG_PEEK))
+		if (!(flags & MSG_PEEK)) {
+#ifdef CONFIG_NETLAT
+			netlat_pick_check(sk, skb);
+#endif
 			sk_eat_skb(sk, skb);
+		}
 		break;
 	} while (len > 0);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9a2fd0cd68a2..014b9e8bc4c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3103,6 +3103,10 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 	}
 }
 
+#ifdef CONFIG_NETLAT
+extern void netlat_ack_check(struct sock *sk, struct sk_buff *skb);
+extern void netlat_queue_check(struct sock *sk, struct sk_buff *skb);
+#endif
 /* Remove acknowledged frames from the retransmission queue. If our packet
  * is before the ack sequence we can discard it as it's confirmed to have
  * arrived at the other end.
 */
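One detail of tcp_jiffies32_delt() above is worth spelling out: for u32
operands, U32_MAX - (j2 - j1) + 1 is exactly (u32)(j1 - j2), so the computed
delta stays correct across a tcp_jiffies32 wrap. A self-contained userspace
sanity check of that identity (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t j2 = UINT32_MAX - 5;	/* stamp taken just before the wrap */
	uint32_t j1 = 10;		/* clock read just after the wrap */
	uint32_t delta;

	/* the exact expression used by tcp_jiffies32_delt() */
	delta = (j1 >= j2) ? j1 - j2 : UINT32_MAX - (j2 - j1) + 1;

	printf("delta=%u\n", delta);	/* prints delta=16 */
	return delta == (uint32_t)(j1 - j2) ? 0 : 1;
}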
@@ -3124,6 +3128,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 	u32 pkts_acked = 0;
 	u32 last_in_flight = 0;
 	bool rtt_update;
+	bool __maybe_unused netlat_oldest = true;
 	int flag = 0;
 
 	first_ackt = 0;
@@ -3205,6 +3210,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 		if (unlikely(skb == tp->lost_skb_hint))
 			tp->lost_skb_hint = NULL;
 		tcp_highest_sack_replace(sk, skb, next);
+#ifdef CONFIG_NETLAT
+		/*
+		 * We are inside the loop! We have made the timestamps
+		 * of the skbs on the rtx queue increase monotonically
+		 * with their sequence numbers, so we only report the
+		 * oldest skb's latency here.
+		 *
+		 * btw: the oldest skb's latency is the max latency
+		 * seen by this function.
+		 */
+		if (netlat_oldest) {
+			netlat_ack_check(sk, skb);
+			netlat_oldest = false;
+		}
+#endif
 		tcp_rtx_queue_unlink_and_free(skb, sk);
 	}
 
@@ -4764,6 +4783,10 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
 	int eaten;
 	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
 
+#ifdef CONFIG_NETLAT
+	netlat_queue_check(sk, skb);
+#endif
+
 	eaten = (tail &&
 		 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b8dd6d7cc91e..c823f4ac218b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -48,6 +48,11 @@
 
 #include
 
+#ifdef CONFIG_NETLAT
+extern void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src);
+extern void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb);
+#endif
+
 /* Refresh clocks of a TCP socket,
  * ensuring monotically increasing values.
  */
@@ -72,6 +77,12 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
 
 	__skb_unlink(skb, &sk->sk_write_queue);
+#ifdef CONFIG_NETLAT
+	/*
+	 * for a tcp socket in established state sending a normal
+	 * data skb
+	 */
+	netlat_tcp_enrtxqueue(sk, skb);
+#endif
 	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
 
 	if (tp->highest_sack == NULL)
@@ -1505,9 +1516,28 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
 	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
-	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
+	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) {
+#ifdef CONFIG_NETLAT
+		/*
+		 * For an skb on the rtx queue that gets split:
+		 *
+		 * e.g. we receive an ack that only partially acks an
+		 * skb on the rtx queue; we must split the partially
+		 * acked skb, release the acked bytes, collect the
+		 * remaining bytes into `buff` and insert buff back
+		 * into the rtx queue.
+		 * Other conditions are similar:
+		 *	an skb on the rtx queue is partially sacked
+		 *	an skb on the rtx queue is partially dsacked
+		 *	....
+		 * Here we must copy the original skb's timestamp to
+		 * the new one (buff).
+		 */
+		netlat_copy_rtxq_skb(sk, buff, skb);
+#endif
 		list_add(&buff->tcp_tsorted_anchor,
 			 &skb->tcp_tsorted_anchor);
+	}
 
 	return 0;
 }
@@ -3386,6 +3416,14 @@ int tcp_send_synack(struct sock *sk)
 			tcp_highest_sack_replace(sk, skb, nskb);
 			tcp_rtx_queue_unlink_and_free(skb, sk);
 			__skb_header_release(nskb);
+#ifdef CONFIG_NETLAT
+			/*
+			 * for a crossed SYN-ACK, e.g. we are in
+			 * SYN_SENT state and received a pure SYN
+			 * from the peer
+			 */
+			netlat_tcp_enrtxqueue(sk, nskb);
+#endif
 			tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
 			sk_wmem_queued_add(sk, nskb->truesize);
 			sk_mem_charge(sk, nskb->truesize);
@@ -3694,6 +3732,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
 	if (!err) {
 		tp->syn_data = (fo->copied > 0);
+#ifdef CONFIG_NETLAT
+		/*
+		 * for a fastopen socket that sends data in the SYN
+		 * skb
+		 */
+		netlat_tcp_enrtxqueue(sk, syn_data);
+#endif
 		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
 		goto done;
@@ -3745,6 +3789,10 @@ int tcp_connect(struct sock *sk)
 	tp->retrans_stamp = tcp_time_stamp(tp);
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
+#ifdef CONFIG_NETLAT
+	/* for the SYN packet */
+	netlat_tcp_enrtxqueue(sk, buff);
+#endif
 	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
 
 	/* Send off SYN; include data in Fast Open. */
-- 
Gitee

From a793ae01f049c8b16007083ea6c6dd0dea54ce97 Mon Sep 17 00:00:00 2001
From: yilingjin
Date: Mon, 29 Jan 2024 17:03:14 +0800
Subject: [PATCH 02/13] sli: add sli related items in the subdirectory of each
 cgroup

The existing sli framework cannot modify the monitoring thresholds of
cgroup subdirectories other than the cpuacct cgroup. To fix this, add
sli.control/sli.monitor/mbuf to the subdirectory of each cgroup and
traverse all of a task's cgroups on each update tick.
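The files added here are read like any other cgroupfs file; a minimal
userspace probe is sketched below (the cgroup-v1 mount point and the group
name "mygrp" are assumptions, not part of this patch):

#include <stdio.h>

int main(void)
{
	/* hypothetical path: memory controller mounted in the usual place */
	FILE *f = fopen("/sys/fs/cgroup/memory/mygrp/sli.control", "r");
	char line[256];

	if (!f)
		return 1;
	/* prints period, mbuf_enable and the per-item thresholds/counts */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}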
Reviewed-by: Bin Lai Signed-off-by: yilingjin --- include/linux/sli.h | 2 + kernel/cgroup/sli.c | 188 +++++++++++++++++++++++++++++++---------- kernel/sched/cpuacct.c | 2 +- mm/memcontrol.c | 24 ++++++ 4 files changed, 169 insertions(+), 47 deletions(-) diff --git a/include/linux/sli.h b/include/linux/sli.h index 32c901b87e9b..ae8ecd126697 100755 --- a/include/linux/sli.h +++ b/include/linux/sli.h @@ -132,6 +132,8 @@ int sli_schedlat_max_show(struct seq_file *m, struct cgroup *cgrp); ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cgroup_sli_control_show(struct seq_file *sf, void *v); +int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v); +int mem_cgroup_sli_control_show(struct seq_file *sf, void *v); void sli_check_longsys(struct task_struct *tsk); void sli_update_tick(struct task_struct *tsk); diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c index faa4756b73e5..2e383e9180e2 100755 --- a/kernel/cgroup/sli.c +++ b/kernel/cgroup/sli.c @@ -182,13 +182,12 @@ static int sli_event_inherit(struct cgroup *cgrp) } static void store_task_stack(struct task_struct *task, char *reason, - u64 duration, unsigned int skipnr) + u64 duration, unsigned int skipnr, struct cgroup *cgrp) { unsigned long *entries; unsigned nr_entries = 0; unsigned long flags; int i; - struct cgroup *cgrp; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_ATOMIC); @@ -197,7 +196,6 @@ static void store_task_stack(struct task_struct *task, char *reason, nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, skipnr); - cgrp = get_cgroup_from_task(task); spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); mbuf_print(cgrp, "record reason:%s comm:%s pid:%d duration=%lld\n", @@ -428,7 +426,7 @@ void sli_memlat_stat_end(enum sli_memlat_stat_item sidx, u64 start) char *lat_name; lat_name = get_memlat_name(sidx); - store_task_stack(current, lat_name, duration, 0); + store_task_stack(current, lat_name, duration, 0, cgrp); } } } @@ -465,7 +463,7 @@ void sli_schedlat_stat(struct task_struct *task, enum sli_schedlat_stat_item sid char *lat_name; lat_name = get_schedlat_name(sidx); - store_task_stack(task, lat_name, delta, 0); + store_task_stack(task, lat_name, delta, 0, cgrp); } } } @@ -640,64 +638,93 @@ static void sli_proactive_monitor_work(struct work_struct *work) css_put(&event_monitor->cgrp->self); } +struct cgroup *get_cgroup_from_task_id(struct task_struct *task, int event_nr) +{ + int id; + struct cgroup *cgrp; + + id = cpuacct_cgrp_id; + switch (event_nr) { +#if IS_ENABLED(CONFIG_MEMCG) + case SLI_MEM_EVENT: + id = memory_cgrp_id; + break; +#endif + default: + break; + } + + /* First, try to get cpuacct/mem cgroup for V1*/ + cgrp = task_cgroup(task, id); + if (cgrp && cgrp->level) + return cgrp; + + /* + * If can not find cpuacct/mem cgroup or cpuacct/mem cgroup is root, just return + * dfl_cgrp. 
+ */ + cgrp = task_dfl_cgroup(task); + + return cgrp; +} + void sli_update_tick(struct task_struct *tsk) { struct cgroup *cgrp; + int i; if (!static_branch_likely(&sli_monitor_enabled)) return; rcu_read_lock(); - cgrp = get_cgroup_from_task(tsk); - if (cgrp && cgroup_parent(cgrp)) { - bool ret; - int period; - unsigned long long old_value, last_update; + for (i = 0; i < SLI_EVENT_NR; i++) { + cgrp = get_cgroup_from_task_id(tsk, i); - period = cgrp->cgrp_event_monitor->period; - if (!period) - goto unlock; + if (cgrp && cgroup_parent(cgrp)) { + bool ret; + int period; + unsigned long long old_value, last_update; -retry: - last_update = READ_ONCE(cgrp->cgrp_event_monitor->last_update); - if (time_after((unsigned long)(period + last_update), jiffies)) - goto unlock; + period = cgrp->cgrp_event_monitor->period; + if (!period) + continue; - old_value = cmpxchg(&cgrp->cgrp_event_monitor->last_update, - last_update, jiffies); - if (old_value != last_update) - goto retry; +retry: + last_update = READ_ONCE(cgrp->cgrp_event_monitor->last_update); + if (time_after((unsigned long)(period + last_update), jiffies)) + continue; - /* - * Current jiffies should be somewhere between period and 8 * period, - * otherwise we consider the it is overrun and should be abandoned. - */ - if (time_before((unsigned long)((period << 3) + last_update), jiffies)) - cgrp->cgrp_event_monitor->overrun = 1; + old_value = cmpxchg(&cgrp->cgrp_event_monitor->last_update, + last_update, jiffies); + if (old_value != last_update) + goto retry; - rcu_read_unlock(); + /* + * Current jiffies should be somewhere between period and 8 * period, + * otherwise we consider the it is overrun and should be abandoned. + */ + if (time_before((unsigned long)((period << 3) + last_update), jiffies)) + cgrp->cgrp_event_monitor->overrun = 1; - ret = css_tryget(&cgrp->self); - if (!ret) - return; + ret = css_tryget(&cgrp->self); + if (!ret) + continue; - /* - * The sli trace work may have a lot a work to do, and should send - * the event to polling tasks. So we don't do the work in interrupt - * context(put the work to the workqueue). - */ - ret = queue_work(sli_workqueue, &cgrp->cgrp_event_monitor->sli_event_work); - /* - * If work had been pushed to workqueue and not been executed, there is no - * need to push it again. So we must put the css refcount. - */ - if (!ret) - css_put(&cgrp->self); - return; + /* + * The sli trace work may have a lot a work to do, and should send + * the event to polling tasks. So we don't do the work in interrupt + * context(put the work to the workqueue). + */ + ret = queue_work(sli_workqueue, &cgrp->cgrp_event_monitor->sli_event_work); + /* + * If work had been pushed to workqueue and not been executed, there is no + * need to push it again. So we must put the css refcount. 
+ */ + if (!ret) + css_put(&cgrp->self); + } } - -unlock: rcu_read_unlock(); } @@ -1067,6 +1094,75 @@ ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, return ret; } +int mem_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); + + for (i = 0; i < MEM_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->memlat_threshold[i], true); + count = sli_convert_value(event_monitor->memlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i), + threshold, count); + } + + return 0; +} + +int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); + + for (i = 0; i < SCHEDLAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->schedlat_threshold[i], true); + count = sli_convert_value(event_monitor->schedlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_schedlat_name(i), + threshold, count); + } + + for (i = 0; i < SLI_LONGTERM_NR; i++) { + threshold = sli_convert_value(event_monitor->longterm_threshold[i], true); + + seq_printf(sf, "%s: threshold: %llu\n", get_longterm_name(i), threshold); + } + + if (!cgroup_parent(cgrp)) { + for (i = 0; i < MEM_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->memlat_threshold[i], true); + count = sli_convert_value(event_monitor->memlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i), + threshold, count); + } + } + + return 0; +} int cgroup_sli_control_show(struct seq_file *sf, void *v) { int i; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0259cfdeff83..7cbd128a0108 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -443,7 +443,7 @@ static struct cftype files[] = { { .name = "sli.control", .write = cgroup_sli_control_write, - .seq_show = cgroup_sli_control_show, + .seq_show = cpuacct_cgroup_sli_control_show, }, { .name = "sli.monitor", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca9a6dedd786..936311cb102c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7714,6 +7714,14 @@ static struct cftype mem_cgroup_legacy_files[] = { .release = cgroup_pressure_release, }, #endif + { + .name = "mbuf", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_mbuf_show, + .seq_start = cgroup_mbuf_start, + .seq_next = cgroup_mbuf_next, + .seq_stop = cgroup_mbuf_stop, + }, #ifdef CONFIG_CGROUP_SLI { .name = "sli", @@ -7725,6 +7733,22 @@ static struct cftype mem_cgroup_legacy_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = mem_cgroup_sli_max_show, }, + { + .name = "sli.control", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_sli_control_write, + .seq_show = mem_cgroup_sli_control_show, + }, + { + .name = "sli.monitor", + .flags = CFTYPE_NOT_ON_ROOT, + .open = 
cgroup_sli_monitor_open, + .seq_show = cgroup_sli_monitor_show, + .seq_start = cgroup_sli_monitor_start, + .seq_next = cgroup_sli_monitor_next, + .seq_stop = cgroup_sli_monitor_stop, + .poll = cgroup_sli_monitor_poll, + }, #endif #ifdef CONFIG_EMM_MEMORY_RECLAIM { -- Gitee From b72e4e7d2081e0b40e7cf13670663cfb16daeb73 Mon Sep 17 00:00:00 2001 From: yilingjin Date: Wed, 20 Mar 2024 14:40:52 +0800 Subject: [PATCH 03/13] sli: add iolat detection implment io latency monitoring based on sli and mbuf. Reviewed-by: Haisu Wang Reviewed-by: Bin Lai Reviewed-by: caelli Signed-off-by: yilingjin --- block/blk-cgroup.c | 56 ++++++++ block/blk-core.c | 32 +++++ block/blk-mq.c | 8 +- include/linux/blkdev.h | 2 +- include/linux/cgroup-defs.h | 3 + include/linux/sli.h | 22 +++ kernel/cgroup/cgroup.c | 20 ++- kernel/cgroup/sli.c | 280 +++++++++++++++++++++++++++++++++++- 8 files changed, 416 insertions(+), 7 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 46e955d076b5..57fbae20df5d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1620,6 +1620,26 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) return 0; } +#ifdef CONFIG_CGROUP_SLI +static int io_cgroup_sli_max_show(struct seq_file *m, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(m)); + struct cgroup *cgrp; + cgrp = blkcg->css.cgroup; + + return sli_iolat_max_show(m, cgrp); +} + +static int io_cgroup_sli_show(struct seq_file *m, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(m)); + struct cgroup *cgrp; + cgrp = blkcg->css.cgroup; + + return sli_iolat_stat_show(m, cgrp); +} +#endif + static struct cftype blkcg_files[] = { { .name = "stat", @@ -1643,6 +1663,42 @@ static struct cftype blkcg_legacy_files[] = { .name = "diskstats_recursive", .seq_show = blkcg_dkstats_recursive_show, }, + { + .name = "mbuf", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_mbuf_show, + .seq_start = cgroup_mbuf_start, + .seq_next = cgroup_mbuf_next, + .seq_stop = cgroup_mbuf_stop, + }, +#ifdef CONFIG_CGROUP_SLI + { + .name = "sli", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = io_cgroup_sli_show, + }, + { + .name = "sli_max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = io_cgroup_sli_max_show, + }, + { + .name = "sli.control", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_sli_control_write, + .seq_show = io_cgroup_sli_control_show, + }, + { + .name = "sli.monitor", + .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_sli_monitor_open, + .seq_show = cgroup_sli_monitor_show, + .seq_start = cgroup_sli_monitor_start, + .seq_next = cgroup_sli_monitor_next, + .seq_stop = cgroup_sli_monitor_stop, + .poll = cgroup_sli_monitor_poll, + }, +#endif { } /* terminate */ }; diff --git a/block/blk-core.c b/block/blk-core.c index 6856b49ed118..df2028efb3d4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -38,6 +38,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -86,6 +87,31 @@ static void blkcg_stat_acct(struct blkcg *blkcg, struct request *req, int new_io } } +#ifdef CONFIG_CGROUP_SLI +static void sli_iolat_stat_end_check(u64 rq_alloc_time_ns, u64 rq_io_start_time_ns, + struct bio *bio, struct blkcg *blkcg) +{ + struct cgroup *cgrp; + u64 sli_iolat_end_time = 0; + u64 bio_start = bio_issue_time(&bio->bi_issue); + + if (!bio_start || !rq_alloc_time_ns || !rq_io_start_time_ns || !blkcg || + blkcg == &blkcg_root) + return; + + cgrp = blkcg->css.cgroup; + if (!cgrp || !cgroup_parent(cgrp)) + return; + + sli_iolat_end_time = __bio_issue_time(ktime_get_ns()); + if (sli_iolat_end_time <= 
bio_start) + return; + + sli_iolat_stat_end(IO_LAT_DELAY, bio_start, rq_alloc_time_ns, rq_io_start_time_ns, + sli_iolat_end_time, sli_iolat_end_time - bio_start, bio, cgrp); +} +#endif + void blkcg_account_io_completion(struct request *req, struct bio *bio, unsigned int bytes) { @@ -95,6 +121,12 @@ void blkcg_account_io_completion(struct request *req, struct bio *bio, struct hd_struct *part; int cpu; +#ifdef CONFIG_CGROUP_SLI + if (static_branch_unlikely(&sli_io_enabled)) + sli_iolat_stat_end_check(req->alloc_time_ns, req->io_start_time_ns, + bio, blkcg); +#endif + cpu = part_stat_lock(); part = req->part; blkcg_part_stat_add(blkcg, cpu, part, sectors[rw], bytes >> 9); diff --git a/block/blk-mq.c b/block/blk-mq.c index b68fc393d78c..f13cbf9e9161 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -326,7 +327,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#if defined(CONFIG_BLK_RQ_ALLOC_TIME) || defined(CONFIG_CGROUP_SLI) rq->alloc_time_ns = alloc_time_ns; #endif if (blk_mq_need_time_stamp(rq)) @@ -368,7 +369,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = ktime_get_ns(); - +#ifdef CONFIG_CGROUP_SLI + else if (static_branch_unlikely(&sli_io_enabled)) + alloc_time_ns = ktime_get_ns(); +#endif data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 875f5433b636..24b101a246e9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -203,7 +203,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#if defined(CONFIG_BLK_RQ_ALLOC_TIME) || defined(CONFIG_CGROUP_SLI) /* Time that the first bio started allocating this request. 
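 	 * (with CONFIG_CGROUP_SLI, this stamp also feeds the io latency sli)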
*/ u64 alloc_time_ns; #endif diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ca005a23ce3e..ca6fde2c3cfd 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -522,6 +522,9 @@ struct cgroup { /* sched latency stat */ struct sli_schedlat_stat __percpu *sli_schedlat_stat_percpu; + /* io latency stat */ + struct sli_iolat_stat __percpu *sli_iolat_stat_percpu; + /* proactive event monitoring structure for cgroup */ struct sli_event_monitor *cgrp_event_monitor; diff --git a/include/linux/sli.h b/include/linux/sli.h index ae8ecd126697..963864f366e5 100755 --- a/include/linux/sli.h +++ b/include/linux/sli.h @@ -37,6 +37,11 @@ enum sli_schedlat_stat_item { SCHEDLAT_STAT_NR }; +enum sli_iolat_stat_item { + IO_LAT_DELAY, + IO_LAT_STAT_NR +}; + struct sli_memlat_stat { unsigned long latency_max[MEM_LAT_STAT_NR]; unsigned long item[MEM_LAT_STAT_NR][LAT_COUNT_NR]; @@ -47,10 +52,16 @@ struct sli_schedlat_stat { unsigned long item[SCHEDLAT_STAT_NR][LAT_COUNT_NR]; }; +struct sli_iolat_stat { + unsigned long latency_max[IO_LAT_STAT_NR]; + unsigned long item[IO_LAT_STAT_NR][LAT_COUNT_NR]; +}; + enum sli_event_type { SLI_SCHED_EVENT, SLI_MEM_EVENT, SLI_LONGTERM_EVENT, + SLI_IO_EVENT, SLI_EVENT_NR }; @@ -100,6 +111,10 @@ struct sli_event_monitor { unsigned long long longterm_threshold[SLI_LONGTERM_NR]; atomic_long_t longterm_statistics[SLI_LONGTERM_NR]; + unsigned long long iolat_threshold[IO_LAT_STAT_NR]; + unsigned long long iolat_count[IO_LAT_STAT_NR]; + atomic_long_t iolat_statistics[IO_LAT_STAT_NR]; + KABI_RESERVE(1); KABI_RESERVE(2); }; @@ -129,11 +144,17 @@ void sli_schedlat_stat(struct task_struct *task,enum sli_schedlat_stat_item sidx void sli_schedlat_rundelay(struct task_struct *task, struct task_struct *prev, u64 delta); int sli_schedlat_stat_show(struct seq_file *m, struct cgroup *cgrp); int sli_schedlat_max_show(struct seq_file *m, struct cgroup *cgrp); +void sli_iolat_stat_end(enum sli_iolat_stat_item sidx, u64 bio_start, u64 rq_alloc_time_ns, + u64 rq_io_start_time_ns, u64 sli_iolat_end_time, u64 duration, struct bio *bio, + struct cgroup *cgrp); +int sli_iolat_max_show(struct seq_file *m, struct cgroup *cgrp); +int sli_iolat_stat_show(struct seq_file *m, struct cgroup *cgrp); ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cgroup_sli_control_show(struct seq_file *sf, void *v); int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v); int mem_cgroup_sli_control_show(struct seq_file *sf, void *v); +int io_cgroup_sli_control_show(struct seq_file *sf, void *v); void sli_check_longsys(struct task_struct *tsk); void sli_update_tick(struct task_struct *tsk); @@ -147,5 +168,6 @@ void sli_monitor_stop(struct seq_file *seq, void *v); __poll_t sli_monitor_poll(struct kernfs_open_file *of, poll_table *pt); int sli_event_add(struct sli_notify_event *notify_event, u32 event_type, u32 levent, u32 count); u32 sli_monitor_signal(struct cgroup *cgrp, struct sli_notify_event *notify_event); +DECLARE_STATIC_KEY_FALSE(sli_io_enabled); #endif /*_LINUX_SLI_H*/ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f925f708e248..1b265d120e80 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3937,6 +3937,13 @@ static int cgroup_net_quality_show(struct seq_file *seq, void *v) #endif #ifdef CONFIG_CGROUP_SLI +static int cgroup_sli_io_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgroup = seq_css(seq)->cgroup; + + return sli_iolat_stat_show(seq, cgroup); +} + 
static int cgroup_sli_memory_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; @@ -3957,7 +3964,8 @@ static int cgroup_sli_max_show(struct seq_file *seq, void *v) struct cgroup *cgroup = seq_css(seq)->cgroup; sli_schedlat_max_show(seq, cgroup); - return sli_memlat_max_show(seq, cgroup); + sli_memlat_max_show(seq, cgroup); + return sli_iolat_max_show(seq, cgroup); } #endif @@ -5655,6 +5663,11 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_mbuf_stop, }, #ifdef CONFIG_CGROUP_SLI + { + .name = "sli.io", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_sli_io_show, + }, { .name = "sli.memory", .flags = CFTYPE_NOT_ON_ROOT, @@ -6141,6 +6154,11 @@ static inline bool cgroup_need_mbuf(struct cgroup *cgrp) return true; #endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) + if (cgroup_css(cgrp, cgroup_subsys[io_cgrp_id])) + return true; +#endif + return false; } diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c index 2e383e9180e2..31a0e9d38c6c 100755 --- a/kernel/cgroup/sli.c +++ b/kernel/cgroup/sli.c @@ -18,6 +18,7 @@ #define MAX_STACK_TRACE_DEPTH 64 static DEFINE_STATIC_KEY_FALSE(sli_enabled); +DEFINE_STATIC_KEY_FALSE(sli_io_enabled); static DEFINE_STATIC_KEY_FALSE(sli_monitor_enabled); static struct sli_event_monitor default_sli_event_monitor; @@ -59,12 +60,17 @@ static const char *longterm_threshold_name[] = { "longterm_irqtime_threshold=" }; +static const char *iolat_threshold_name[] = { + "iolat_delay_threshold=" +}; + static const char *sanity_check_abbr[] = { "schedlat_", "memlat_", "longterm_", "period=", - "mbuf_enable=" + "mbuf_enable=", + "iolat_" }; static void sli_proactive_monitor_work(struct work_struct *work); @@ -96,6 +102,8 @@ static void sli_event_monitor_init(struct sli_event_monitor *event_monitor, stru memset(&event_monitor->memlat_threshold, 0xff, sizeof(event_monitor->memlat_threshold)); memset(&event_monitor->memlat_count, 0xff, sizeof(event_monitor->memlat_count)); memset(&event_monitor->longterm_threshold, 0xff, sizeof(event_monitor->longterm_threshold)); + memset(&event_monitor->iolat_threshold, 0xff, sizeof(event_monitor->iolat_threshold)); + memset(&event_monitor->iolat_count, 0xff, sizeof(event_monitor->iolat_count)); event_monitor->last_update = jiffies; event_monitor->cgrp = cgrp; @@ -153,6 +161,12 @@ static int sli_event_inherit(struct cgroup *cgrp) &cgrp_event_monitor->longterm_statistics[new_event->event_id], sli_get_longterm_statistics(cgrp, new_event->event_id)); break; + case SLI_IO_EVENT: + cgrp_event_monitor->iolat_threshold[new_event->event_id] = + READ_ONCE(event_monitor->iolat_threshold[new_event->event_id]); + cgrp_event_monitor->iolat_count[new_event->event_id] = + READ_ONCE(event_monitor->iolat_count[new_event->event_id]); + break; default: printk(KERN_ERR "%s: invalid sli_event type!\n", __func__); goto failed; @@ -240,6 +254,21 @@ static char * get_memlat_name(enum sli_memlat_stat_item sidx) return name; } +static char *get_iolat_name(enum sli_iolat_stat_item sidx) +{ + char *name = NULL; + + switch (sidx) { + case IO_LAT_DELAY: + name = "iolat_delay"; + break; + default: + break; + } + + return name; +} + static enum sli_lat_count get_lat_count_idx(u64 duration) { enum sli_lat_count idx; @@ -435,6 +464,106 @@ void sli_memlat_stat_end(enum sli_memlat_stat_item sidx, u64 start) rcu_read_unlock(); } +static u64 sli_iolat_stat_gather(struct cgroup *cgrp, + enum sli_iolat_stat_item sidx, + enum sli_lat_count cidx) +{ + u64 sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += 
per_cpu_ptr(cgrp->sli_iolat_stat_percpu, cpu)->item[sidx][cidx]; + + return sum; +} + +int sli_iolat_stat_show(struct seq_file *m, struct cgroup *cgrp) +{ + enum sli_iolat_stat_item sidx; + + if (!static_branch_likely(&sli_io_enabled)) { + seq_printf(m, "sli_io is not enabled, please echo 1 > /proc/sli/sli_io_enabled\n"); + return 0; + } + + if (!cgrp->sli_iolat_stat_percpu) + return 0; + + for (sidx = IO_LAT_DELAY; sidx < IO_LAT_STAT_NR; sidx++) { + seq_printf(m, "%s:\n", get_iolat_name(sidx)); + seq_printf(m, "0-1ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_0_1)); + seq_printf(m, "1-4ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_1_4)); + seq_printf(m, "4-8ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_4_8)); + seq_printf(m, "8-16ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_8_16)); + seq_printf(m, "16-32ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_16_32)); + seq_printf(m, "32-64ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_32_64)); + seq_printf(m, "64-128ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_64_128)); + seq_printf(m, ">=128ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_128_INF)); + } + + return 0; +} + +int sli_iolat_max_show(struct seq_file *m, struct cgroup *cgrp) +{ + enum sli_iolat_stat_item sidx; + + if (!static_branch_likely(&sli_io_enabled)) { + seq_printf(m, "sli_io is not enabled, please echo 1 > /proc/sli/sli_io_enabled\n"); + return 0; + } + + if (!cgrp->sli_iolat_stat_percpu) + return 0; + + for (sidx = IO_LAT_DELAY; sidx < IO_LAT_STAT_NR; sidx++) { + int cpu; + unsigned long latency_sum = 0; + + for_each_possible_cpu(cpu) + latency_sum += per_cpu_ptr(cgrp->sli_iolat_stat_percpu, cpu)->latency_max[sidx]; + + seq_printf(m, "%s: %lu\n", get_iolat_name(sidx), latency_sum); + } + + return 0; +} + +void sli_iolat_stat_end(enum sli_iolat_stat_item sidx, u64 bio_start, u64 rq_alloc_time_ns, + u64 rq_io_start_time_ns, u64 sli_iolat_end_time, u64 duration, struct bio *bio, + struct cgroup *cgrp) +{ + enum sli_lat_count cidx; + + cidx = get_lat_count_idx(duration); + duration = duration >> 10; + this_cpu_inc(cgrp->sli_iolat_stat_percpu->item[sidx][cidx]); + this_cpu_add(cgrp->sli_iolat_stat_percpu->latency_max[sidx], duration); + + if (static_branch_unlikely(&sli_monitor_enabled)) { + struct sli_event_monitor *event_monitor = cgrp->cgrp_event_monitor; + + if (duration < READ_ONCE(event_monitor->iolat_threshold[sidx])) + return; + + atomic_long_inc(&event_monitor->iolat_statistics[sidx]); + if (event_monitor->mbuf_enable) { + char *lat_name; + unsigned long flags; + char b[BDEVNAME_SIZE]; + + lat_name = get_iolat_name(sidx); + spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); + mbuf_print(cgrp, "record reason:%s devname:%s duration_us=%lld " + "bio_start=%llu req_start=%llu req_issue=%llu " + "bio_complete=%llu\n", lat_name, bio_devname(bio, b), + duration, bio_start, rq_alloc_time_ns, + rq_io_start_time_ns, sli_iolat_end_time); + spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags); + } + } +} + void sli_schedlat_stat(struct task_struct *task, enum sli_schedlat_stat_item sidx, u64 delta) { struct cgroup *cgrp = NULL; @@ -625,6 +754,20 @@ static void sli_proactive_monitor_work(struct work_struct *work) sli_event_add(notify_event, event->event_type, event->event_id, (int)(statistics - last_statistics)); break; + case SLI_IO_EVENT: + statistics = (u64)atomic_long_read( + &event_monitor->iolat_statistics[event->event_id]); + atomic_long_set(&event_monitor->iolat_statistics[event->event_id], 0); + + if (event_monitor->overrun) { + 
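+				/* the monitor work missed its sampling window (flagged in sli_update_tick()), so this period's count is stale: clear the flag and skip raising an event */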
event_monitor->overrun = 0; + break; + } + + if (statistics >= READ_ONCE(event_monitor->iolat_count[event->event_id])) + sli_event_add(notify_event, event->event_type, + event->event_id, statistics); + break; default: break; } @@ -649,6 +792,11 @@ struct cgroup *get_cgroup_from_task_id(struct task_struct *task, int event_nr) case SLI_MEM_EVENT: id = memory_cgrp_id; break; +#endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) + case SLI_IO_EVENT: + id = io_cgrp_id; + break; #endif default: break; @@ -705,7 +853,7 @@ void sli_update_tick(struct task_struct *tsk) * otherwise we consider the it is overrun and should be abandoned. */ if (time_before((unsigned long)((period << 3) + last_update), jiffies)) - cgrp->cgrp_event_monitor->overrun = 1; + cgrp->cgrp_event_monitor->overrun = 1; ret = css_tryget(&cgrp->self); if (!ret) @@ -933,7 +1081,24 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec sec->mbuf_enable = !!value; break; + case 5: + for (i = 0; i < ARRAY_SIZE(iolat_threshold_name); i++) { + min_len = min(len, (int)strlen((const char *)iolat_threshold_name[i])); + if (!strncmp(iolat_threshold_name[i], buf, min_len)) + break; + } + + if (i == ARRAY_SIZE(iolat_threshold_name)) + return -EINVAL; + + buf += min_len; + ret = sli_parse_threshold(buf, sec); + if (ret) + return ret; + sec->event_type = SLI_IO_EVENT; + sec->event_id = i; + break; default: return -EINVAL; } @@ -1081,6 +1246,14 @@ ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, sli_get_longterm_statistics(cgrp, sec.event_id)); ret = sli_event_update(event_monitor, &sec, last_threshold); break; + case SLI_IO_EVENT: + last_threshold = event_monitor->iolat_threshold[sec.event_id]; + WRITE_ONCE(event_monitor->iolat_threshold[sec.event_id], sec.threshold); + WRITE_ONCE(event_monitor->iolat_count[sec.event_id], sec.count); + smp_wmb(); + atomic_long_set(&event_monitor->iolat_statistics[sec.event_id], 0); + ret = sli_event_update(event_monitor, &sec, last_threshold); + break; default: break; } @@ -1094,6 +1267,33 @@ ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, return ret; } +int io_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); + + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } + + return 0; +} + int mem_cgroup_sli_control_show(struct seq_file *sf, void *v) { int i; @@ -1159,6 +1359,13 @@ int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v) seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i), threshold, count); } + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } } return 0; @@ -1201,6 +1408,13 @@ int cgroup_sli_control_show(struct seq_file *sf, void *v) seq_printf(sf, "%s: 
threshold: %llu\n", get_longterm_name(i), threshold); } + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } return 0; } @@ -1451,15 +1665,71 @@ static const struct file_operations sli_enabled_fops = { .release = single_release, }; +static int sli_io_enabled_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", static_key_enabled(&sli_io_enabled)); + return 0; +} + +static int sli_io_enabled_open(struct inode *inode, struct file *file) +{ + return single_open(file, sli_io_enabled_show, NULL); +} + +static ssize_t sli_io_enabled_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + char val = -1; + int ret = count; + + if (count < 1 || *ppos) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&val, ubuf, 1)) { + ret = -EFAULT; + goto out; + } + + switch (val) { + case '0': + if (static_key_enabled(&sli_io_enabled)) + static_branch_disable(&sli_io_enabled); + break; + case '1': + if (!static_key_enabled(&sli_io_enabled)) + static_branch_enable(&sli_io_enabled); + break; + default: + ret = -EINVAL; + } + +out: + return ret; +} + +static const struct file_operations sli_io_enabled_fops = { + .open = sli_io_enabled_open, + .read = seq_read, + .write = sli_io_enabled_write, + .llseek = seq_lseek, + .release = single_release, +}; + int sli_cgroup_alloc(struct cgroup *cgroup) { if (!cgroup_need_sli(cgroup)) return 0; spin_lock_init(&cgroup->cgrp_mbuf_lock); + cgroup->sli_iolat_stat_percpu = alloc_percpu(struct sli_iolat_stat); + if (!cgroup->sli_iolat_stat_percpu) + goto out; + cgroup->sli_memlat_stat_percpu = alloc_percpu(struct sli_memlat_stat); if (!cgroup->sli_memlat_stat_percpu) - goto out; + goto free_iolat_percpu; cgroup->sli_schedlat_stat_percpu = alloc_percpu(struct sli_schedlat_stat); if (!cgroup->sli_schedlat_stat_percpu) @@ -1481,6 +1751,8 @@ int sli_cgroup_alloc(struct cgroup *cgroup) free_percpu(cgroup->sli_schedlat_stat_percpu); free_memlat_percpu: free_percpu(cgroup->sli_memlat_stat_percpu); +free_iolat_percpu: + free_percpu(cgroup->sli_iolat_stat_percpu); out: return -ENOMEM; } @@ -1497,6 +1769,7 @@ void sli_cgroup_free(struct cgroup *cgroup) if (!cgroup->cgrp_event_monitor) return; + free_percpu(cgroup->sli_iolat_stat_percpu); free_percpu(cgroup->sli_memlat_stat_percpu); free_percpu(cgroup->sli_schedlat_stat_percpu); /* Free memory from the event list */ @@ -1518,6 +1791,7 @@ static int __init sli_proc_init(void) } proc_mkdir("sli", NULL); proc_create("sli/sli_enabled", 0, NULL, &sli_enabled_fops); + proc_create("sli/sli_io_enabled", 0, NULL, &sli_io_enabled_fops); return 0; } -- Gitee From 0bf743254d4db2f5822c68d39c7b06765b1d7b4a Mon Sep 17 00:00:00 2001 From: yilingjin Date: Tue, 5 Mar 2024 20:30:54 +0800 Subject: [PATCH 04/13] mbuf: attach udesc to files without mbuf here is a test case, run three shell instances like the following, the system may crash: mkdir /sys/fs/cgroup/blkio/testdir while : do cat /sys/fs/cgroup/blkio/testdir/blkio.mbuf done Reviewed-by: mengensun Signed-off-by: yilingjin --- block/blk-cgroup.c | 2 ++ include/linux/cgroup.h | 2 ++ kernel/cgroup/cgroup.c | 55 +++++++++++++++++++++++------------------- kernel/sched/cpuacct.c | 2 ++ mm/memcontrol.c | 2 ++ 5 files changed, 38 insertions(+), 25 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 
57fbae20df5d..a5802fe6ca99 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1665,11 +1665,13 @@ static struct cftype blkcg_legacy_files[] = { }, { .name = "mbuf", + .open = cgroup_mbuf_open, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, #ifdef CONFIG_CGROUP_SLI { diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4b3345b37c04..b881a324a7e0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -1015,6 +1015,8 @@ void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos); void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos); void cgroup_mbuf_stop(struct seq_file *s, void *v); int cgroup_mbuf_show(struct seq_file *s, void *v); +int cgroup_mbuf_open(struct kernfs_open_file *of); +void cgroup_mbuf_release(struct kernfs_open_file *of); int cgroup_sli_monitor_open(struct kernfs_open_file *of); void *cgroup_sli_monitor_start(struct seq_file *s, loff_t *pos); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1b265d120e80..ec62fc8163e6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3974,31 +3974,27 @@ void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos) struct cgroup *cgrp = seq_css(s)->cgroup; struct mbuf_slot *mb = cgrp->mbuf; u32 index; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_user_desc *udesc; if (!mb) return NULL; + udesc = (struct mbuf_user_desc *)ctx->psi.trigger; index = *pos; /* If already reach end, just return */ if (index && index == mb->mring->next_idx) return NULL; - if (!mb->udesc) { - mb->udesc = kmalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); - - if (!mb->udesc) - goto out; - - mb->udesc->user_idx = mb->mring->first_idx; - mb->udesc->user_seq = mb->mring->first_seq; - } + udesc->user_idx = mb->mring->first_idx; + udesc->user_seq = mb->mring->first_seq; /* Maybe reach end or empty */ - if (mb->udesc->user_idx == mb->mring->next_idx) + if (udesc->user_idx == mb->mring->next_idx) return NULL; -out: - return mb->udesc; + return udesc; } void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos) @@ -4016,19 +4012,7 @@ void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos) return udesc; } -void cgroup_mbuf_stop(struct seq_file *s, void *v) -{ - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_user_desc *desc; - - if (cgrp->mbuf) { - desc = cgrp->mbuf->udesc; - if(desc && desc->user_idx == cgrp->mbuf->mring->next_idx) { - kfree(cgrp->mbuf->udesc); - cgrp->mbuf->udesc = NULL; - } - } -} +void cgroup_mbuf_stop(struct seq_file *s, void *v) { } int cgroup_mbuf_show(struct seq_file *s, void *v) { @@ -4046,6 +4030,25 @@ int cgroup_mbuf_show(struct seq_file *s, void *v) return 0; } +int cgroup_mbuf_open(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + ctx->psi.trigger = (struct mbuf_user_desc *) + kzalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); + if (!ctx->psi.trigger) + return -ENOMEM; + return 0; +} + +void cgroup_mbuf_release(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->psi.trigger) + kfree((struct mbuf_user_desc *)ctx->psi.trigger); +} + /* * Get cgroup struct from task_struct for mbuf and sli. 
* @@ -5657,10 +5660,12 @@ static struct cftype cgroup_base_files[] = { { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, #ifdef CONFIG_CGROUP_SLI { diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 7cbd128a0108..6b6f9673cd55 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -414,10 +414,12 @@ static struct cftype files[] = { { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, #ifdef CONFIG_PSI { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 936311cb102c..e3cb824319d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7717,10 +7717,12 @@ static struct cftype mem_cgroup_legacy_files[] = { { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, #ifdef CONFIG_CGROUP_SLI { -- Gitee From 877240378a8b20120fe9a2b10d84a09127e650b5 Mon Sep 17 00:00:00 2001 From: yilingjin Date: Mon, 18 Mar 2024 14:48:39 +0800 Subject: [PATCH 05/13] sli: add lock before write sli_io_enabled/sli_enabled here is a test case, run two shell instances like following, the system may WARN_ON_ONCE in kernel/jump_label.c:201 static_key_disable_cpuslocked+0x82/0x90: while : do echo 1 > /proc/sli/sli_io_enabled echo 0 > /proc/sli/sli_io_enabled done Reviewed-by: Bin Lai Signed-off-by: yilingjin --- kernel/cgroup/sli.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c index 31a0e9d38c6c..0dc2c673eb6f 100755 --- a/kernel/cgroup/sli.c +++ b/kernel/cgroup/sli.c @@ -1640,6 +1640,7 @@ static ssize_t sli_enabled_write(struct file *file, const char __user *ubuf, goto out; } + inode_lock(file_inode(file)); switch (val) { case '0': if (static_key_enabled(&sli_enabled)) @@ -1652,6 +1653,7 @@ static ssize_t sli_enabled_write(struct file *file, const char __user *ubuf, default: ret = -EINVAL; } + inode_unlock(file_inode(file)); out: return ret; @@ -1692,6 +1694,7 @@ static ssize_t sli_io_enabled_write(struct file *file, const char __user *ubuf, goto out; } + inode_lock(file_inode(file)); switch (val) { case '0': if (static_key_enabled(&sli_io_enabled)) @@ -1704,6 +1707,7 @@ static ssize_t sli_io_enabled_write(struct file *file, const char __user *ubuf, default: ret = -EINVAL; } + inode_unlock(file_inode(file)); out: return ret; -- Gitee From 79abd993db28e95ecc63d4842e11093fafde5ba7 Mon Sep 17 00:00:00 2001 From: yilingjin Date: Tue, 19 Mar 2024 19:52:14 +0800 Subject: [PATCH 06/13] mbuf: move mbuf/sctx_free to css_free_rwork_fn for rcu protection Reviewed-by: Bin Lai Signed-off-by: yilingjin --- kernel/cgroup/cgroup.c | 9 ++------- kernel/cgroup/sli.c | 2 ++ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ec62fc8163e6..852927a6717d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5765,6 +5765,8 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); + if (cgrp->mbuf) + mbuf_free(cgrp); #ifdef CONFIG_CGROUP_SLI sli_cgroup_free(cgrp); #endif @@ -6419,13 
+6421,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup_bpf_offline(cgrp); - if (cgrp->mbuf) - mbuf_free(cgrp); - -#ifdef CONFIG_CGROUP_SLI - if (cgrp->sctx) - sctx_free(cgrp); -#endif /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c index 0dc2c673eb6f..26449793d0c3 100755 --- a/kernel/cgroup/sli.c +++ b/kernel/cgroup/sli.c @@ -1765,6 +1765,8 @@ void sli_cgroup_free(struct cgroup *cgroup) { struct sli_event *event, *event_tmp; + if (cgroup->sctx) + sctx_free(cgroup); /* * Cgroup's subsys would be cleared before sli_cgroup_free() had been called. * So we use !cgroup->cgrp_event_monitor instead of cgroup_need_sli to check -- Gitee From d4c223e899b77682ddf1188bc6b0640a1500b830 Mon Sep 17 00:00:00 2001 From: yilingjin Date: Wed, 27 Mar 2024 22:19:12 +0800 Subject: [PATCH 07/13] mbuf: fix endless reading there is a corner case in mbuf_next interface which may wrap ahead from the beginning of the mring, fix it btw: there is no lock in read-write side of mbuf, add fix for it. Reviewed-by: mengensun Signed-off-by: yilingjin --- include/linux/mbuf.h | 5 ++-- kernel/cgroup/cgroup.c | 67 ++++++++++++++++++++++++++++++++++-------- kernel/cgroup/mbuf.c | 8 ++--- 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/include/linux/mbuf.h b/include/linux/mbuf.h index 32779eee2ef5..6ddb54d26c4b 100644 --- a/include/linux/mbuf.h +++ b/include/linux/mbuf.h @@ -47,14 +47,13 @@ struct mbuf_user_desc { /* each cgroup has a mbuf_slot struct */ struct mbuf_slot { u32 idx; - /* write op must hold this lock */ - spinlock_t slot_lock; + /* snapshot/write op must hold this lock */ + seqlock_t slot_lock; /* rate limit */ struct ratelimit_state ratelimit; struct cgroup *owner; const struct mbuf_operations *ops; struct mbuf_ring *mring; - struct mbuf_user_desc *udesc; }; struct mbuf_operations { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 852927a6717d..3e2915a8ffd5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3971,17 +3971,16 @@ static int cgroup_sli_max_show(struct seq_file *seq, void *v) void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos) { - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; u32 index; struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; - struct mbuf_user_desc *udesc; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)ctx->psi.trigger; - if (!mb) + /* why: see cgroup_mbuf_open */ + if (!mb->mring) return NULL; - udesc = (struct mbuf_user_desc *)ctx->psi.trigger; index = *pos; /* If already reach end, just return */ if (index && index == mb->mring->next_idx) @@ -4000,8 +3999,13 @@ void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos) void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos) { struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + + /* why: see cgroup_mbuf_open */ + if (!mb->mring) + return NULL; udesc->user_idx = mb->ops->next(mb->mring, udesc->user_idx); *pos = udesc->user_idx; @@ -4018,8 +4022,13 @@ int cgroup_mbuf_show(struct seq_file *s, void *v) { ssize_t ret; struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; - struct cgroup *cgrp = 
seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + + /* why: see cgroup_mbuf_open */ + if (!mb->mring) + return 0; memset(udesc->buf, 0, sizeof(udesc->buf)); ret = mb->ops->read(mb, udesc); @@ -4030,14 +4039,39 @@ int cgroup_mbuf_show(struct seq_file *s, void *v) return 0; } +extern void *snapshot_mbuf(struct mbuf_slot *, struct mbuf_slot*, seqlock_t *); +extern u32 get_mbuf_slot_len(void); int cgroup_mbuf_open(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = seq_css(of->seq_file)->cgroup->mbuf; + u32 mbuf_slot_len; - ctx->psi.trigger = (struct mbuf_user_desc *) - kzalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); + /* use ctx->psi.trigger for mbuf_user_desc */ + ctx->psi.trigger = kzalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); if (!ctx->psi.trigger) return -ENOMEM; + + mbuf_slot_len = get_mbuf_slot_len(); + /* use ctx->procs1.pidlist for mbuf_slot snapshot */ + ctx->procs1.pidlist = vmalloc(mbuf_slot_len); + if (!ctx->procs1.pidlist) { + kfree(ctx->psi.trigger); + ctx->psi.trigger = NULL; + return -ENOMEM; + } + memset(ctx->procs1.pidlist, 0, mbuf_slot_len); + + /* cgroup may have no mbuf attached, because the mbuf pool + * has a max num + * here we let file open success, so, seq_ops must + * check mring point + */ + if (!mb) + return 0; + + snapshot_mbuf((struct mbuf_slot *)ctx->procs1.pidlist, mb, &mb->slot_lock); + return 0; } @@ -4045,8 +4079,15 @@ void cgroup_mbuf_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; - if (ctx->psi.trigger) - kfree((struct mbuf_user_desc *)ctx->psi.trigger); + if (ctx->psi.trigger) { + kfree(ctx->psi.trigger); + ctx->psi.trigger = NULL; + } + + if (ctx->procs1.pidlist) { + vfree(ctx->procs1.pidlist); + ctx->procs1.pidlist = NULL; + } } /* diff --git a/kernel/cgroup/mbuf.c b/kernel/cgroup/mbuf.c index d52206b0c053..aba0588950f0 100644 --- a/kernel/cgroup/mbuf.c +++ b/kernel/cgroup/mbuf.c @@ -269,10 +269,10 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) mring = cg->mbuf->mring; len = sizeof(struct mbuf_ring_desc) + size; - spin_lock_irqsave(&cg->mbuf->slot_lock, flags); + write_seqlock_irqsave(&cg->mbuf->slot_lock, flags); if (mbuf_prepare(mring, len)){ - spin_unlock_irqrestore(&cg->mbuf->slot_lock, flags); + write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags); pr_err("mbuf: Can not find enough space.\n"); return 0; } @@ -291,7 +291,7 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) mring->next_idx += desc->len; mring->next_seq++; - spin_unlock_irqrestore(&cg->mbuf->slot_lock, flags); + write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags); return size; } @@ -341,7 +341,7 @@ static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index) mb->owner = cg; mb->idx = index; mb->ops = &mbuf_ops; - spin_lock_init(&mb->slot_lock); + seqlock_init(&mb->slot_lock); ratelimit_state_init(&mb->ratelimit, 5 * HZ,50); mb->mring = (struct mbuf_ring *)((char *)mb + sizeof(struct mbuf_slot)); -- Gitee From 3b4add89fb15bc10bbc18a31b0fb5abadad637bc Mon Sep 17 00:00:00 2001 From: yilingjin Date: Tue, 7 May 2024 20:00:26 +0800 Subject: [PATCH 08/13] mbuf: use CONFIG_MBUF for cgroup mbuf Signed-off-by: yilingjin Reviewed-by: mengensun --- block/blk-cgroup.c | 2 ++ kernel/cgroup/cgroup.c | 2 +- kernel/sched/cpuacct.c | 2 ++ mm/memcontrol.c | 2 ++ 4 files changed, 
7 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a5802fe6ca99..47fd6e0b6835 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1663,6 +1663,7 @@ static struct cftype blkcg_legacy_files[] = { .name = "diskstats_recursive", .seq_show = blkcg_dkstats_recursive_show, }, +#ifdef CONFIG_RQM { .name = "mbuf", .open = cgroup_mbuf_open, @@ -1673,6 +1674,7 @@ static struct cftype blkcg_legacy_files[] = { .seq_stop = cgroup_mbuf_stop, .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_CGROUP_SLI { .name = "sli", diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3e2915a8ffd5..95374a804c16 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5697,7 +5697,6 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_net_quality_show, .release = cgroup_net_release, }, -#endif { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, @@ -5708,6 +5707,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_mbuf_stop, .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_CGROUP_SLI { .name = "sli.io", diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 6b6f9673cd55..4f49ec1eb14f 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -411,6 +411,7 @@ static struct cftype files[] = { .name = "uptime", .seq_show = cpuacct_uptime_show, }, +#ifdef CONFIG_RQM { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, @@ -421,6 +422,7 @@ static struct cftype files[] = { .seq_stop = cgroup_mbuf_stop, .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_PSI { .name = "cpu.pressure", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3cb824319d8..a68a4323358b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7714,6 +7714,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .release = cgroup_pressure_release, }, #endif +#ifdef CONFIG_RQM { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, @@ -7724,6 +7725,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_stop = cgroup_mbuf_stop, .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_CGROUP_SLI { .name = "sli", -- Gitee From 3b626e9188f4e92b90b518a72277cd2da9b14954 Mon Sep 17 00:00:00 2001 From: yilingjin Date: Wed, 29 May 2024 11:37:26 +0800 Subject: [PATCH 09/13] sli: fix non-mutual exclusion between show and write of control Reviewed-by: Liu Chun Signed-off-by: yilingjin --- kernel/cgroup/sli.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c index 26449793d0c3..7c77fac8c753 100755 --- a/kernel/cgroup/sli.c +++ b/kernel/cgroup/sli.c @@ -1280,6 +1280,7 @@ int io_cgroup_sli_control_show(struct seq_file *sf, void *v) else event_monitor = &default_sli_event_monitor; + inode_lock_shared(file_inode(sf->file)); seq_printf(sf, "period: %d\n", event_monitor->period); seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); @@ -1291,6 +1292,7 @@ int io_cgroup_sli_control_show(struct seq_file *sf, void *v) threshold, count); } + inode_unlock_shared(file_inode(sf->file)); return 0; } @@ -1307,6 +1309,7 @@ int mem_cgroup_sli_control_show(struct seq_file *sf, void *v) else event_monitor = &default_sli_event_monitor; + inode_lock_shared(file_inode(sf->file)); seq_printf(sf, "period: %d\n", event_monitor->period); seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); @@ -1318,6 +1321,7 @@ int mem_cgroup_sli_control_show(struct seq_file *sf, void *v) threshold, count); } + inode_unlock_shared(file_inode(sf->file)); return 0; } @@ -1334,6 +1338,7 @@ int 
cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v) else event_monitor = &default_sli_event_monitor; + inode_lock_shared(file_inode(sf->file)); seq_printf(sf, "period: %d\n", event_monitor->period); seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); @@ -1368,6 +1373,7 @@ int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v) } } + inode_unlock_shared(file_inode(sf->file)); return 0; } int cgroup_sli_control_show(struct seq_file *sf, void *v) @@ -1383,6 +1389,7 @@ int cgroup_sli_control_show(struct seq_file *sf, void *v) else event_monitor = &default_sli_event_monitor; + inode_lock_shared(file_inode(sf->file)); seq_printf(sf, "period: %d\n", event_monitor->period); seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); @@ -1415,6 +1422,7 @@ int cgroup_sli_control_show(struct seq_file *sf, void *v) seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), threshold, count); } + inode_unlock_shared(file_inode(sf->file)); return 0; } -- Gitee From 1990c82aa2d59c8c3aaec8233373e105d21966cc Mon Sep 17 00:00:00 2001 From: caelli Date: Wed, 11 Jun 2025 20:11:29 +0800 Subject: [PATCH 10/13] netlat: backport tk5 feature backport tk5 netlat code to generate a non kabi version Reviewed-by: yuehongwu Signed-off-by: caelli --- include/linux/mbuf.h | 23 +++--- include/linux/proc_fs.h | 9 +++ include/net/net_namespace.h | 6 ++ include/net/netns_mbuf.h | 29 ++++++++ include/net/tcp.h | 1 + kernel/cgroup/cgroup.c | 5 +- kernel/cgroup/mbuf.c | 95 +++++++++++++++---------- net/core/netns_mbuf.c | 135 ++++++++++-------------------------- net/ipv4/netlat.c | 74 +++++++++----------- net/ipv4/netlat.h | 51 ++++++++++++++ 10 files changed, 239 insertions(+), 189 deletions(-) create mode 100644 include/net/netns_mbuf.h create mode 100644 net/ipv4/netlat.h diff --git a/include/linux/mbuf.h b/include/linux/mbuf.h index 6ddb54d26c4b..28d4346080ca 100644 --- a/include/linux/mbuf.h +++ b/include/linux/mbuf.h @@ -1,14 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2021 bauerchen + * Copyright (C) 2024 mengensun */ #ifndef _CGROUP_MBUF_H #define _CGROUP_MBUF_H -#include -#include - struct mbuf_struct { u32 mbuf_len; u32 mbuf_max_slots; @@ -51,23 +48,33 @@ struct mbuf_slot { seqlock_t slot_lock; /* rate limit */ struct ratelimit_state ratelimit; - struct cgroup *owner; + void *owner; const struct mbuf_operations *ops; struct mbuf_ring *mring; }; struct mbuf_operations { /* read message */ - ssize_t (*read) (struct mbuf_slot *, struct mbuf_user_desc *); + ssize_t (*read)(struct mbuf_slot *_slot, struct mbuf_user_desc *udest); + /* get next available idx */ - u32 (*next) (struct mbuf_ring *, u32); + u32 (*next)(struct mbuf_ring *mring, u32 idx); + /* write message */ - ssize_t (*write) (struct cgroup *, const char *, va_list); + ssize_t (*write)(struct mbuf_slot *mbuf, const char *fmt, va_list args); } ____cacheline_aligned; void __init mbuf_bmap_init(void); void __init setup_mbuf(void); + struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg); +struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops); void mbuf_free(struct cgroup *cg); + +ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...); +void snapshot_mbuf(struct mbuf_slot *, struct mbuf_slot*, seqlock_t *); +u32 get_mbuf_slot_len(void); +void mbuf_free_slot(struct mbuf_slot *slot); +void mbuf_reset(struct mbuf_slot *mbuf); #endif diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d22c1c7fa774..8b9cb125a10e 100644 --- 
a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -81,6 +81,15 @@ extern struct pid *tgid_pidfd_to_pid(const struct file *file); extern struct net init_net; extern struct list_head sysctl_restrict_list; +#ifdef CONFIG_NETNS_MBUF +void *seq_open_net_large_private(struct inode *inode, struct file *file); +struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *proc_ops); +#endif + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 539972f4325c..de8123fea233 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -38,6 +38,9 @@ #include #include #include +#ifdef CONFIG_NETNS_MBUF +#include +#endif struct user_namespace; struct proc_dir_entry; @@ -190,6 +193,9 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; +#ifdef CONFIG_NETNS_MBUF + struct net_mbuf mbuf; +#endif } __randomize_layout; #include diff --git a/include/net/netns_mbuf.h b/include/net/netns_mbuf.h new file mode 100644 index 000000000000..6a272949c4a4 --- /dev/null +++ b/include/net/netns_mbuf.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * make mbuf can be used by net namespace + * + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc + */ +#ifndef __NETNS_MBUF +#define __NETNS_MBUF + +#include +#include + +#ifdef CONFIG_NETNS_MBUF +struct net_mbuf { + struct proc_dir_entry *twatcher; + struct proc_dir_entry *log; + struct mbuf_slot *slot; +}; + +int inet_mbuf_init(void); +void inet_mbuf_exit(void); +ssize_t net_mbuf_print(struct net *net, const char *fmt, ...); +#else +static __always_inline int inet_mbuf_init(void) {return 0; } +static __always_inline void inet_mbuf_exit(void) {} +static __always_inline ssize_t net_mbuf_print(struct net *net, const char *fmt, ...) {return 0; }; +#endif +#endif diff --git a/include/net/tcp.h b/include/net/tcp.h index d7660fda7bd4..63cbb6af9d2d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -879,6 +879,7 @@ struct tcp_skb_cb { has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ + __u32 first_xmit_time; union { struct { /* There is space for up to 24 bytes */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 95374a804c16..f3bb7c91f8d1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4039,7 +4039,6 @@ int cgroup_mbuf_show(struct seq_file *s, void *v) return 0; } -extern void *snapshot_mbuf(struct mbuf_slot *, struct mbuf_slot*, seqlock_t *); extern u32 get_mbuf_slot_len(void); int cgroup_mbuf_open(struct kernfs_open_file *of) { @@ -4134,7 +4133,7 @@ ssize_t mbuf_print_task(struct task_struct *task, const char *fmt, ...) if (mb->ops) { va_start(args, fmt); - mb->ops->write(cgrp, fmt, args); + mb->ops->write(mb, fmt, args); va_end(args); } @@ -4158,7 +4157,7 @@ ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...) 
if (mb->ops) { va_start(args, fmt); - mb->ops->write(cgrp, fmt, args); + mb->ops->write(mb, fmt, args); va_end(args); } diff --git a/kernel/cgroup/mbuf.c b/kernel/cgroup/mbuf.c index aba0588950f0..3ed303b15124 100644 --- a/kernel/cgroup/mbuf.c +++ b/kernel/cgroup/mbuf.c @@ -1,11 +1,11 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: GPL-2.0-only /* * Quality Monitor Buffer * Aim to provide backup buffer for RQM to record critical message. * Could be used to catch critical context when abnormal jitters occur. * - * Author: bauerchen - * Copyright (C) 2021 Tencent, Inc + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc */ #include @@ -57,7 +57,7 @@ static void __init mbuf_len_update(u64 size) (u64)MBUF_LEN_MAX); } - if (size < MBUF_LEN_MIN){ + if (size < MBUF_LEN_MIN) { size = (u64) MBUF_LEN_MIN; pr_warn("mbuf: monitor buffer less [ %llu ] is not supported.\n", (u64) MBUF_LEN_MIN); @@ -118,7 +118,7 @@ void __init mbuf_bmap_init(void) L1_CACHE_BYTES); mbuf_bitmap = kmalloc(alloc_size, __GFP_HIGH|__GFP_ZERO); - if(!mbuf_bitmap){ + if (!mbuf_bitmap) { pr_err("mbuf: alloc mbuf_bitmap failed!\n"); return; } @@ -165,7 +165,7 @@ static u32 mbuf_next(struct mbuf_ring *mring, u32 curr_idx) * just goto head */ frees = mring->end_idx - next_idx; - if(frees < sizeof(struct mbuf_ring_desc)){ + if (frees < sizeof(struct mbuf_ring_desc)) { next_idx = mring->base_idx; goto next; } @@ -230,9 +230,8 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size) { u32 frees; - if (unlikely(msg_size > MBUF_MSG_LEN_MAX)) { + if (unlikely(msg_size > MBUF_MSG_LEN_MAX)) return -ENOMEM; - } while (mring->first_seq < mring->next_seq) { @@ -253,26 +252,26 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size) } /* Write monitor buffer message */ -static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) +static ssize_t do_mbuf_write(struct mbuf_slot *mbuf, char *buffer, size_t size) { struct mbuf_ring *mring; struct mbuf_ring_desc *desc; size_t len; unsigned long flags; - if (size >= g_mbuf.mbuf_size_per_cg){ + if (size >= g_mbuf.mbuf_size_per_cg) { pr_err("mbuf: write message need less than [ %u ] bytes\n", g_mbuf.mbuf_size_per_cg); return 0; } - mring = cg->mbuf->mring; + mring = mbuf->mring; len = sizeof(struct mbuf_ring_desc) + size; - write_seqlock_irqsave(&cg->mbuf->slot_lock, flags); + write_seqlock_irqsave(&mbuf->slot_lock, flags); - if (mbuf_prepare(mring, len)){ - write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags); + if (mbuf_prepare(mring, len)) { + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); pr_err("mbuf: Can not find enough space.\n"); return 0; } @@ -291,20 +290,25 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) mring->next_idx += desc->len; mring->next_seq++; - write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags); + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); return size; } -void mbuf_reset(struct mbuf_ring *mring) +void mbuf_reset(struct mbuf_slot *mbuf) { - mring->first_idx = mring->base_idx; - mring->first_seq = 0; - mring->next_idx = mring->base_idx; - mring->next_seq = 0; + unsigned long flags; + + write_seqlock_irqsave(&mbuf->slot_lock, flags); + mbuf->mring->first_idx = mbuf->mring->base_idx; + mbuf->mring->first_seq = 0; + mbuf->mring->next_idx = mbuf->mring->base_idx; + mbuf->mring->next_seq = 0; + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); } +EXPORT_SYMBOL(mbuf_reset); -static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) +static ssize_t 
mbuf_write(struct mbuf_slot *mbuf, const char *fmt, va_list args) { static char buf[MBUF_MSG_LEN_MAX]; char *text = buf; @@ -314,7 +318,7 @@ static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) t_len = vscnprintf(text, sizeof(buf), fmt, args); /* Write string to mbuf */ - ret = do_mbuf_write(cg, text, t_len); + ret = do_mbuf_write(mbuf, text, t_len); return ret; } @@ -336,23 +340,30 @@ static int get_next_mbuf_id(unsigned long *addr, u32 start) return index; } -static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index) +static void mbuf_slot_init(struct mbuf_slot *mb, + void *owner, u32 index, struct mbuf_operations *ops) { - mb->owner = cg; + mb->owner = owner; mb->idx = index; - mb->ops = &mbuf_ops; + + if (!ops) + mb->ops = &mbuf_ops; + else + mb->ops = ops; + seqlock_init(&mb->slot_lock); - ratelimit_state_init(&mb->ratelimit, 5 * HZ,50); + ratelimit_state_init(&mb->ratelimit, 5 * HZ, 50); mb->mring = (struct mbuf_ring *)((char *)mb + sizeof(struct mbuf_slot)); - mb->mring->base_idx = index * - g_mbuf.mbuf_size_per_cg + sizeof(struct mbuf_slot) + sizeof(struct mbuf_ring); + mb->mring->base_idx = index * g_mbuf.mbuf_size_per_cg + + sizeof(struct mbuf_slot) + + sizeof(struct mbuf_ring); mb->mring->end_idx = (index + 1) * g_mbuf.mbuf_size_per_cg - 1; - mbuf_reset(mb->mring); + mbuf_reset(mb); } -struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops) { struct mbuf_slot *mb; u32 index = 0; @@ -401,26 +412,38 @@ struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) g_mbuf.mbuf_next_id = index; mb = (struct mbuf_slot *)(g_mbuf.mbuf + index * g_mbuf.mbuf_size_per_cg); - mbuf_slot_init(mb, cg, index); + mbuf_slot_init(mb, owner, index, ops); g_mbuf.mbuf_frees--; spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); return mb; } +EXPORT_SYMBOL(mbuf_slot_alloc_v2); -void mbuf_free(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +{ + return mbuf_slot_alloc_v2((void *)cg, NULL); +} +EXPORT_SYMBOL(mbuf_slot_alloc); + +void mbuf_free_slot(struct mbuf_slot *slot) { unsigned long flags; spin_lock_irqsave(&g_mbuf.mbuf_lock, flags); - /* Make current idx the next available buffer */ - g_mbuf.mbuf_next_id = cg->mbuf->idx; + g_mbuf.mbuf_next_id = slot->idx; __clear_bit(g_mbuf.mbuf_next_id, g_mbuf.mbuf_bitmap); - g_mbuf.mbuf_frees++; spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); + +} +EXPORT_SYMBOL(mbuf_free_slot); + +void mbuf_free(struct cgroup *cg) +{ + mbuf_free_slot(cg->mbuf); } static u32 rd_mbuf_next(struct mbuf_ring *mring, u32 curr_idx) @@ -493,7 +516,7 @@ static ssize_t rd_mbuf_read(struct mbuf_slot *mb, struct mbuf_user_desc *udesc) * called snapshot_mbuf copy data from mbuf to the `dst`. 
then read * the dst use the following ops * - * all the index is offset from the the end point of mring of the + * all the index is offset from the end point of mring of the * snapshot, instead of from the global mbuf memory pool * * btw: the private data of seq file is the ideal place to hold the diff --git a/net/core/netns_mbuf.c b/net/core/netns_mbuf.c index 08bc7eb6135d..52f8b626307e 100644 --- a/net/core/netns_mbuf.c +++ b/net/core/netns_mbuf.c @@ -5,49 +5,22 @@ * Author: yuehongwu * Copyright (C) 2024 Tencent, Inc */ - +#include #include #include #include #include -struct net_mbuf_data { - struct proc_dir_entry *twatcher; - struct proc_dir_entry *log; - /* used between write side and read side, read side can - * be delayed, read side can't be delayed - */ - seqlock_t lock; - /* this ugly, while i have no idea how to deal the kabi - * because the kabi of mbuf have a cgroup parameter - * !!this is not really a cgroup, just a struct cgroup, - * make kabi and mbuf happy - * - * when next kabi version is planned, here can: - * 1. add a pointer type of struct mbuf_slot, to struct - * net - * 2. and change the mbuf kabi using the mbuf_slot as the - * parameter insteading using cgroup - * 3. delete those code using net_generic just using the - * pointer in struct net directly - */ - unsigned char cgroup[0]; -}; - struct mbuf_seq_data { struct seq_net_private snp; struct mbuf_user_desc udesc; - struct mbuf_slot snapshot[0]; + struct mbuf_slot snapshot[]; }; -unsigned int net_mbuf_id __read_mostly; -static struct mbuf_slot *get_net_mbuf(struct net *net) +static inline struct mbuf_slot *get_net_mbuf(struct net *net) { - struct net_mbuf_data *pdata; - - pdata = net_generic(net, net_mbuf_id); - return ((struct cgroup *)pdata->cgroup)->mbuf; + return net->mbuf.slot; } /* not controlled by sysctl_qos_mbuf_enable because we will @@ -55,29 +28,16 @@ static struct mbuf_slot *get_net_mbuf(struct net *net) */ ssize_t net_mbuf_print(struct net *net, const char *fmt, ...) 
{ - struct mbuf_slot *mb; - struct cgroup *cg; va_list args; - struct net_mbuf_data *pdata; - - pdata = net_generic(net, net_mbuf_id); - - cg = (struct cgroup *)(pdata->cgroup); - mb = cg->mbuf; + struct mbuf_slot *slot; - if (!mb) - goto out; - - if (!__ratelimit(&mb->ratelimit)) + slot = net->mbuf.slot; + if (!slot || !__ratelimit(&slot->ratelimit)) goto out; - if (mb->ops) { - va_start(args, fmt); - write_seqlock(&pdata->lock); - mb->ops->write(cg, fmt, args); - write_sequnlock(&pdata->lock); - va_end(args); - } + va_start(args, fmt); + slot->ops->write(slot, fmt, args); + va_end(args); out: return 0; } @@ -158,11 +118,8 @@ static int netns_mbuf_show(struct seq_file *s, void *v) return 0; } -extern void *seq_open_net_large_private(struct inode *, struct file*); -extern void *snapshot_mbuf(struct mbuf_slot *dst, struct mbuf_slot *src, seqlock_t *mbuf_lock); static int seq_mbuf_open(struct inode *inode, struct file *file) { - struct net_mbuf_data *pd; struct mbuf_seq_data *p; struct mbuf_slot *mbuf; @@ -183,8 +140,7 @@ static int seq_mbuf_open(struct inode *inode, struct file *file) if (!mbuf) return 0; - pd = net_generic(p->snp.net, net_mbuf_id); - snapshot_mbuf(p->snapshot, mbuf, &pd->lock); + snapshot_mbuf(p->snapshot, mbuf, &mbuf->slot_lock); return 0; } @@ -204,28 +160,20 @@ static int seq_mbuf_release(struct inode *ino, struct file *f) /* when write clear the data */ ssize_t seq_mbuf_write(struct file *f, const char __user *ubuf, - size_t size, loff_t *_pos) + size_t size, loff_t *_pos) { struct seq_file *seq = f->private_data; struct mbuf_seq_data *p; - struct net_mbuf_data *pd; struct mbuf_slot *mb; p = seq->private; - pd = net_generic(p->snp.net, net_mbuf_id); mb = get_net_mbuf(p->snp.net); /* the netns not attached mbuf */ if (!mb) return size; - /* reset the mbuf to clear all the data */ - write_seqlock(&pd->lock); - mb->mring->first_idx = mb->mring->base_idx; - mb->mring->first_seq = 0; - mb->mring->next_idx = mb->mring->base_idx; - mb->mring->next_seq = 0; - write_sequnlock(&pd->lock); + mbuf_reset(mb); return size; } @@ -254,19 +202,14 @@ static const struct file_operations mbuf_seq_fops = { }; extern struct proc_dir_entry *proc_create_net_data_ops(const char *name, - umode_t mode, struct proc_dir_entry *parent, - const struct seq_operations *seq_ops, - unsigned int state_size, void *data, - const struct file_operations *fops); -extern u32 get_mbuf_slot_len(void); + umode_t mode, struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *fops); + static int __net_init net_mbuf_init(struct net *net) { - int ret; - struct net_mbuf_data *p; - struct mbuf_slot *mbuf; - - ret = 0; - p = net_generic(net, net_mbuf_id); + int ret = 0; /* if mbuf alloc failed, make the netns create success * @@ -279,23 +222,22 @@ static int __net_init net_mbuf_init(struct net *net) * `speak nothing` * cgroup is used for kabi */ - seqlock_init(&p->lock); - mbuf = mbuf_slot_alloc((struct cgroup *)(p->cgroup)); - if (!mbuf) + net->mbuf.slot = mbuf_slot_alloc_v2((void *)net, NULL); + if (!net->mbuf.slot) pr_err("fail alloc mbuf"); - ((struct cgroup *)p->cgroup)->mbuf = mbuf; - p->twatcher = proc_net_mkdir(net, "twatcher", net->proc_net); - if (!p->twatcher) { + net->mbuf.twatcher = proc_net_mkdir(net, "twatcher", net->proc_net); + if (!net->mbuf.twatcher) { ret = -ENOMEM; goto free_mbuf; } - p->log = proc_create_net_data_ops("log", S_IFREG | 0644, p->twatcher, - &mbuf_seq_ops, - sizeof(struct mbuf_seq_data) + 
get_mbuf_slot_len(), - NULL, &mbuf_seq_fops); - if (!p->log) { + net->mbuf.log = proc_create_net_data_ops("log", S_IFREG | 0644, + net->mbuf.twatcher, + &mbuf_seq_ops, + sizeof(struct mbuf_seq_data) + get_mbuf_slot_len(), + NULL, &mbuf_seq_fops); + if (!net->mbuf.log) { ret = -ENOMEM; goto remove_watcher; } @@ -305,32 +247,25 @@ static int __net_init net_mbuf_init(struct net *net) remove_proc_entry("twatcher", net->proc_net); free_mbuf: - if (mbuf) - mbuf_free((struct cgroup *)(p->cgroup)); + if (net->mbuf.slot) + mbuf_free_slot(net->mbuf.slot); return ret; } static void __net_exit net_mbuf_exit(struct net *net) { - struct net_mbuf_data *pdata; - - pdata = net_generic(net, net_mbuf_id); - - remove_proc_entry("log", pdata->twatcher); + remove_proc_entry("log", net->mbuf.twatcher); remove_proc_entry("twatcher", net->proc_net); - /* if mbuf allocte failed, no need to free*/ - if (!((struct cgroup *)pdata->cgroup)->mbuf) + /* if mbuf allocate failed, no need to free */ + if (!net->mbuf.slot) return; - mbuf_free((struct cgroup *)(pdata->cgroup)); + mbuf_free_slot(net->mbuf.slot); } static struct pernet_operations net_mbuf_ops = { .init = net_mbuf_init, .exit = net_mbuf_exit, - .id = &net_mbuf_id, - /* for kabi */ - .size = sizeof(struct net_mbuf_data) + sizeof(struct cgroup), }; int inet_mbuf_init(void) diff --git a/net/ipv4/netlat.c b/net/ipv4/netlat.c index e256f4e4fe81..3984e8510968 100644 --- a/net/ipv4/netlat.c +++ b/net/ipv4/netlat.c @@ -10,7 +10,8 @@ #include #include #include -extern int net_mbuf_print(struct net *net, const char *fmt, ...); +#include +#include "netlat.h" struct netlat_net_data { int ack; @@ -56,39 +57,24 @@ static inline long *get_net_ports(struct net *net) return pdata->ports; } -/* this function is only can be used with skb on rtx queue - * because the skb on rtx queue is never be transmit down - * so the ack_seq is not used for all the skb on trx queue - * if we add a field in skb, the kapi is changed, we need a - * delt time from `skb enqueue to rtx queue` to `skb dequeue - * from rtx queue`, because all the current field about - * timestamp is reflesh when skb is restransmitted, we can - * not use thoese field, we borrow the ack_seq to record the - * time when skb enqueue to rtx queue. - * - * !! in next version allow change the kabi, please add a - * field in skb, and change the follow thress function to - * using the new added field. - * borrow the ack_seq is so trick!! 
- */ static inline u32 get_rtxq_skb_jiffies(struct sk_buff *skb) { - return TCP_SKB_CB(skb)->ack_seq; + return TCP_SKB_CB(skb)->first_xmit_time; } static inline void set_rtxq_skb_jiffies(struct sk_buff *skb) { - TCP_SKB_CB(skb)->ack_seq = tcp_jiffies32; + TCP_SKB_CB(skb)->first_xmit_time = tcp_jiffies32; } /* sk is not used for now, but, may be used in the future */ void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, - struct sk_buff *src) + struct sk_buff *src) { if (!static_branch_unlikely(&enable_netlat)) return; - TCP_SKB_CB(dst)->ack_seq = TCP_SKB_CB(src)->ack_seq; + TCP_SKB_CB(dst)->first_xmit_time = TCP_SKB_CB(src)->first_xmit_time; } EXPORT_SYMBOL(netlat_copy_rtxq_skb); @@ -100,10 +86,10 @@ static inline u32 tcp_jiffies32_delt(struct sk_buff *skb) j2 = get_rtxq_skb_jiffies(skb); /* here leave a small time windows - * when skb is alloced ack_num is inited to 0 - * if we do not touch the time stamp in ack_num - * it is zero - */ + * when skb is alloced ack_num is inited to 0 + * if we do not touch the time stamp in ack_num + * it is zero + */ if (!j2) return 0; @@ -151,9 +137,9 @@ void netlat_ack_check(struct sock *sk, struct sk_buff *skb) return; net_mbuf_print(net, "TCP AC %u %pI4 %d %pI4 %d\n", - (unsigned int)(jiffies_to_msecs(lat)), - &sk->sk_rcv_saddr, (int)sk->sk_num, - &sk->sk_daddr, (int)ntohs(sk->sk_dport)); + (unsigned int)(jiffies_to_msecs(lat)), + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); } EXPORT_SYMBOL(netlat_ack_check); @@ -187,7 +173,7 @@ static void handle_net_timestamp(bool closed) if (closed) { need_time_stamp--; if (need_time_stamp == 0 && - static_branch_unlikely(&enable_netlat)) + static_branch_unlikely(&enable_netlat)) net_disable_timestamp(); return; } @@ -195,7 +181,7 @@ static void handle_net_timestamp(bool closed) /*0->!0*/ need_time_stamp++; if (need_time_stamp == 1 && - static_branch_unlikely(&enable_netlat)) + static_branch_unlikely(&enable_netlat)) net_enable_timestamp(); } @@ -224,8 +210,7 @@ static void handle_net_timestamp_exit(bool queue, bool pick) if (!static_branch_unlikely(&enable_netlat)) return; - /* - * if we dec the counter to zero and netlat enabled + /* if we dec the counter to zero and netlat enabled * disable the timestamp */ if (!need_time_stamp && (queue || pick)) @@ -233,7 +218,7 @@ static void handle_net_timestamp_exit(bool queue, bool pick) } static int proc_do_netlat_pick(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) { int prev; int ret; @@ -280,8 +265,8 @@ static int proc_do_netlat_queue(struct ctl_table *table, int write, } static int proc_do_netlat_enable(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void __user *buffer, + size_t *lenp, loff_t *ppos) { int prev; int ret; @@ -427,7 +412,7 @@ static void netlat_exit_ipv4_ctl_table(struct net *net) /* print msg to per net mbuf when latency from * netif to queued on tcp receive queue */ -void netlat_queue_check(struct sock *sk, struct sk_buff *skb) +void netlat_queue_check(struct sock *sk, struct sk_buff *skb, int flags) { struct net *net; s64 lat; @@ -456,11 +441,16 @@ void netlat_queue_check(struct sock *sk, struct sk_buff *skb) lat = lat < 0 ? 
0 : lat; if (lat < thresh) return; - - net_mbuf_print(net, "TCP QU %u %pI4 %d %pI4 %d\n", - (unsigned int)lat, - &sk->sk_rcv_saddr, (int)sk->sk_num, - &sk->sk_daddr, (int)ntohs(sk->sk_dport)); + if (flags & QUEUE_FLAG_RCV) + net_mbuf_print(net, "TCP QU %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); + else /* QUEUE_FLAG_OFO for now */ + net_mbuf_print(net, "TCP OO %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); } EXPORT_SYMBOL(netlat_queue_check); @@ -498,8 +488,8 @@ void netlat_pick_check(struct sock *sk, struct sk_buff *skb) return; net_mbuf_print(net, "TCP PI %u %pI4 %d %pI4 %d\n", - (unsigned int)lat, &sk->sk_rcv_saddr, (int)sk->sk_num, - &sk->sk_daddr, (int)ntohs(sk->sk_dport)); + (unsigned int)lat, &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); } EXPORT_SYMBOL(netlat_pick_check); diff --git a/net/ipv4/netlat.h b/net/ipv4/netlat.h new file mode 100644 index 000000000000..b0a59c5585a0 --- /dev/null +++ b/net/ipv4/netlat.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * add a netlat to monitor tcp -package latency + * + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc + */ + +#ifndef H______NETLAT +#define H______NETLAT + +#ifdef CONFIG_NETLAT + +#define QUEUE_FLAG_OFO 0x1 +#define QUEUE_FLAG_RCV 0x2 + +int netlat_net_init(void); +void netlat_net_exit(void); +void netlat_ack_check(struct sock *sk, struct sk_buff *skb); +void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src); +void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb); +#define netlat_check(oldest, sk, skb) \ +do { \ + if (oldest) { \ + netlat_ack_check(sk, skb); \ + oldest = false; \ + } \ +} while (0) + +void netlat_queue_check(struct sock *sk, struct sk_buff *skb, int flags); +void netlat_pick_check(struct sock *sk, struct sk_buff *skb); + +#else /* CONFIG_NETLAT */ +static __always_inline int netlat_net_init(void) { return 0; }; +static __always_inline void netlat_net_exit(void) { }; +static __always_inline void netlat_ack_check(struct sock *sk, + struct sk_buff *skb) { }; +static __always_inline void netlat_copy_rtxq_skb(struct sock *sk, + struct sk_buff *dst, + struct sk_buff *src) { }; +static __always_inline void netlat_tcp_enrtxqueue(struct sock *sk, + struct sk_buff *skb) { }; +#define netlat_check(oldest, sk, skb) + +#define QUEUE_FLAG_OFO 0x1 +#define QUEUE_FLAG_RCV 0x2 +#define netlat_queue_check(sk, skb, flags) + +#define netlat_pick_check(sk, skb) +#endif /* !CONFIG_NETLAT */ +#endif -- Gitee From c6ae2f336f0e51e4f53ade82b10ee70f13e9fd49 Mon Sep 17 00:00:00 2001 From: yilingjin Date: Thu, 21 Aug 2025 15:17:48 +0800 Subject: [PATCH 11/13] mbuf: bugfix: move mbuf_free to cgroup_destroy_locked and set NULL after free Now some services started by systemd will mkdir and rmdir sub-cgs of memory, but will not release them completely, so they will not go to css_free_rwork_fn to call mbuf_free, resulting in invalid mbuf space occupation. After mbuf_free releases mbuf_slot, cg->mbuf is not set to NULL, resulting in the incompletely released cg still being able to use the slot. However, the slot has been marked as available and can be allocated to another cg, resulting in data confusion. 
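The fix is therefore twofold: mbuf_free() moves to cgroup_destroy_locked(), which runs exactly once at rmdir time, so the slot is returned even when the css lingers afterwards; and mbuf_free() now clears cg->mbuf under cgrp_mbuf_lock before releasing the slot, so writers that check cg->mbuf under the same lock (see the memcontrol.c hunk) can no longer print through a slot that has been recycled to another cgroup.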
Reviewed-by: caelli Signed-off-by: yilingjin --- kernel/cgroup/cgroup.c | 5 +++-- kernel/cgroup/mbuf.c | 10 +++++++++- mm/memcontrol.c | 7 ++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f3bb7c91f8d1..9ec853feba30 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5805,8 +5805,6 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - if (cgrp->mbuf) - mbuf_free(cgrp); #ifdef CONFIG_CGROUP_SLI sli_cgroup_free(cgrp); #endif @@ -6461,6 +6459,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup_bpf_offline(cgrp); + if (cgrp->mbuf) + mbuf_free(cgrp); + /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); diff --git a/kernel/cgroup/mbuf.c b/kernel/cgroup/mbuf.c index 3ed303b15124..78f81312d455 100644 --- a/kernel/cgroup/mbuf.c +++ b/kernel/cgroup/mbuf.c @@ -443,7 +443,15 @@ EXPORT_SYMBOL(mbuf_free_slot); void mbuf_free(struct cgroup *cg) { - mbuf_free_slot(cg->mbuf); + unsigned long flags; + struct mbuf_slot *slot; + + spin_lock_irqsave(&cg->cgrp_mbuf_lock, flags); + slot = cg->mbuf; + cg->mbuf = NULL; + spin_unlock_irqrestore(&cg->cgrp_mbuf_lock, flags); + + mbuf_free_slot(slot); } static u32 rd_mbuf_next(struct mbuf_ring *mring, u32 curr_idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a68a4323358b..d5c39baf26a9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1617,6 +1617,8 @@ static void priority_kill_process(struct task_struct *victim, struct task_struct *p; struct mm_struct *mm; struct mem_cgroup *memcg; + unsigned long flags; + struct cgroup *cgrp; p = find_lock_task_mm(victim); if (!p) { @@ -1637,8 +1639,11 @@ static void priority_kill_process(struct task_struct *victim, /* Now we select [ victim ] to kill, just record it to mbuf */ memcg = mem_cgroup_from_task(victim); - mbuf_print(memcg->css.cgroup, "memqos: Killing process [ %s ] pid [ %d ] for memory reclaim", + cgrp = memcg->css.cgroup; + spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); + mbuf_print(cgrp, "memqos: Killing process [ %s ] pid [ %d ] for memory reclaim", victim->comm, victim->pid); + spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags); /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; -- Gitee From f5e10f35598fde6e0d49b51dc57e7f605232ee2a Mon Sep 17 00:00:00 2001 From: yilingjin Date: Fri, 22 Aug 2025 15:30:29 +0800 Subject: [PATCH 12/13] net/mbuf: add kernel.qos_mbuf_enable before mbuf_alloc Executing "ip netns add ns0" to create a new netns consumes a mbuf_slot, so sysctl kernel.qos_mbuf_enable is needed to limit this. But when the IPv4 protocol stack is initialized, the root netns needs a mbuf_slot, so ensure that this first allocation always succeeds regardless of the sysctl.
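With this gate, a netns created while kernel.qos_mbuf_enable is 0 simply runs with net->mbuf.slot == NULL; net_mbuf_print() and seq_mbuf_open() already tolerate a missing slot, so /proc/net/twatcher/log exists but stays silent. The slot is allocated only at namespace creation, so flipping the sysctl on later does not retrofit a slot to an existing netns:

sysctl -w kernel.qos_mbuf_enable=1
ip netns add ns0    # only namespaces created from now on get a slot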
From f5e10f35598fde6e0d49b51dc57e7f605232ee2a Mon Sep 17 00:00:00 2001
From: yilingjin
Date: Fri, 22 Aug 2025 15:30:29 +0800
Subject: [PATCH 12/13] net/mbuf: check kernel.qos_mbuf_enable before
 mbuf_alloc

Every "ip netns add ns0" that creates a new netns consumes an
mbuf_slot, so honor the sysctl kernel.qos_mbuf_enable to limit that
consumption. However, when the IPv4 protocol stack is initialized the
root netns (init_net) still needs an mbuf_slot, so make sure this
first allocation is always attempted regardless of the sysctl.

Reviewed-by: Yuehong Wu
Reviewed-by: caelli
Signed-off-by: yilingjin
---
 net/core/netns_mbuf.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/net/core/netns_mbuf.c b/net/core/netns_mbuf.c
index 52f8b626307e..81dee51f5c85 100644
--- a/net/core/netns_mbuf.c
+++ b/net/core/netns_mbuf.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 
+extern int sysctl_qos_mbuf_enable;
 struct mbuf_seq_data {
 	struct seq_net_private snp;
 	struct mbuf_user_desc udesc;
@@ -221,10 +222,18 @@ static int __net_init net_mbuf_init(struct net *net)
 	 * is make usr interface not changed, and make netlat
 	 * `speak nothing`
 	 * cgroup is used for kabi
+	 *
+	 * When the IPv4 protocol stack is initialized, the root
+	 * netns (init_net) still needs an mbuf_slot, so the first
+	 * allocation is always attempted regardless of the sysctl.
 	 */
-	net->mbuf.slot = mbuf_slot_alloc_v2((void *)net, NULL);
-	if (!net->mbuf.slot)
-		pr_err("fail alloc mbuf");
+	if (sysctl_qos_mbuf_enable || net == &init_net) {
+		net->mbuf.slot = mbuf_slot_alloc_v2((void *)net, NULL);
+		if (!net->mbuf.slot)
+			pr_err("netns: failed to alloc mbuf");
+	} else {
+		net->mbuf.slot = NULL;
+	}
 
 	net->mbuf.twatcher = proc_net_mkdir(net, "twatcher", net->proc_net);
 	if (!net->mbuf.twatcher) {
-- 
Gitee
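
The shape of the gate above is worth a general note: a disabled
feature is treated as "absent" (a quiet NULL), not as an allocation
failure, while the root instance bypasses the gate. A minimal sketch
assuming kernel context, with hypothetical names (res, res_alloc and
maybe_alloc stand in for the mbuf types and are not from this series):

    struct res;
    struct res *res_alloc(void *owner);

    static struct res *maybe_alloc(void *owner, bool is_root, int enabled)
    {
    	/* Disabled means "absent", not "failed": return NULL
    	 * quietly for new instances, but always serve the root one.
    	 */
    	if (!enabled && !is_root)
    		return NULL;
    	return res_alloc(owner);
    }

One consequence: once kernel.qos_mbuf_enable is 0, net->mbuf.slot is
legitimately NULL for newly created namespaces, so every consumer of
the slot has to tolerate a NULL pointer.
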
From 4ae89b27429b4db549792f1222d93d85bbf2b4b4 Mon Sep 17 00:00:00 2001
From: yilingjin
Date: Wed, 2 Jul 2025 20:30:07 +0800
Subject: [PATCH 13/13] sli: bugfix: remove count control for
 longterm_rundelay and longterm_irqtime

The longterm_rundelay and longterm_irqtime events take only a
threshold, so stop requiring and parsing the count= parameter for
them.

Reviewed-by: caelli
Signed-off-by: yilingjin
---
 include/linux/cgroup.h |  1 -
 include/linux/mbuf.h   |  2 ++
 kernel/cgroup/sli.c    | 35 +++++++++++++++++++++++------------
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b881a324a7e0..fea0909b2e44 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -1009,7 +1009,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 ssize_t cgroup_priority(struct cgroup_subsys_state *css);
 struct cgroup *get_cgroup_from_task(struct task_struct *task);
-ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...);
 ssize_t mbuf_print_task(struct task_struct *task, const char *fmt, ...);
 void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos);
 void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos);
diff --git a/include/linux/mbuf.h b/include/linux/mbuf.h
index 28d4346080ca..7f3705923f86 100644
--- a/include/linux/mbuf.h
+++ b/include/linux/mbuf.h
@@ -6,6 +6,8 @@
 #ifndef _CGROUP_MBUF_H
 #define _CGROUP_MBUF_H
 
+#include 
+
 struct mbuf_struct {
 	u32 mbuf_len;
 	u32 mbuf_max_slots;
diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c
index 7c77fac8c753..71e9a58b6c0c 100755
--- a/kernel/cgroup/sli.c
+++ b/kernel/cgroup/sli.c
@@ -955,7 +955,8 @@ static unsigned long sli_get_longterm_statistics(struct cgroup *cgrp,
 	return latency_sum;
 }
 
-static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec)
+static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec,
+				      int index)
 {
 	char *str;
 	int i, len, ret;
@@ -963,15 +964,21 @@ static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec)
 
 	/* Replace the delimiter with '\0' */
 	len = strlen(buf);
-	for (i = 0; i < len; i++) {
-		if (buf[i] == ',' || buf[i] == ' ') {
-			buf[i] = '\0';
-			break;
+	if (len == 0)
+		return -EINVAL;
+
+	/* longterm_rundelay/irqtime don't need the delimiter check */
+	if (index != 2) {
+		for (i = 0; i < len; i++) {
+			if (buf[i] == ',' || buf[i] == ' ') {
+				buf[i] = '\0';
+				break;
+			}
 		}
-	}
 
-	if (i == len)
-		return -EINVAL;
+		if (i == len)
+			return -EINVAL;
+	}
 
 	/* Parse the value for theshold */
 	ret = kstrtou64(buf, 0, &value);
@@ -980,6 +987,10 @@
 	sec->threshold = sli_convert_value(value, false);
 
+	/* longterm_rundelay/irqtime don't need the count= param */
+	if (index == 2)
+		return 0;
+
 	/* Move the pointer to the positon which after the delimiter */
 	buf += (i + 1);
 	len -= (i + 1);
@@ -1017,7 +1028,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec
 			return -EINVAL;
 
 		buf += min_len;
-		ret = sli_parse_threshold(buf, sec);
+		ret = sli_parse_threshold(buf, sec, index);
 		if (ret)
 			return ret;
 
@@ -1035,7 +1046,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec
 			return -EINVAL;
 
 		buf += min_len;
-		ret = sli_parse_threshold(buf, sec);
+		ret = sli_parse_threshold(buf, sec, index);
 		if (ret)
 			return ret;
 
@@ -1053,7 +1064,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec
 			return -EINVAL;
 
 		buf += min_len;
-		ret = sli_parse_threshold(buf, sec);
+		ret = sli_parse_threshold(buf, sec, index);
 		if (ret)
 			return ret;
 
@@ -1092,7 +1103,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec
 			return -EINVAL;
 
 		buf += min_len;
-		ret = sli_parse_threshold(buf, sec);
+		ret = sli_parse_threshold(buf, sec, index);
 		if (ret)
 			return ret;
 
-- 
Gitee
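
To make the two accepted input forms concrete, here is a toy userspace
parser mirroring the control flow of sli_parse_threshold after this
patch; the names, the sample values, and the use of index 2 for the
longterm events are illustrative assumptions, not the kernel interface
itself:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Most events take "<threshold>,<count>" (or space-separated);
     * the longterm events take a bare "<threshold>".
     */
    static int parse(char *buf, int longterm, unsigned long long *thresh)
    {
    	size_t len = strlen(buf);
    	size_t i = 0;

    	if (len == 0)
    		return -1;

    	if (!longterm) {	/* a count part must follow */
    		for (i = 0; i < len; i++) {
    			if (buf[i] == ',' || buf[i] == ' ') {
    				buf[i] = '\0';
    				break;
    			}
    		}
    		if (i == len)
    			return -1;	/* delimiter is mandatory */
    	}

    	*thresh = strtoull(buf, NULL, 0);
    	return 0;
    }

    int main(void)
    {
    	char a[] = "1000,5", b[] = "1000";
    	unsigned long long t;

    	if (!parse(a, 0, &t))
    		printf("event: threshold=%llu\n", t);
    	if (!parse(b, 1, &t))
    		printf("longterm: threshold=%llu\n", t);
    	return 0;
    }
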