diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 46e955d076b5b4a03133b2e66b9ec428ba08c349..47fd6e0b6835949af3e3f298df2f7061bd2127f9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1620,6 +1620,26 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) return 0; } +#ifdef CONFIG_CGROUP_SLI +static int io_cgroup_sli_max_show(struct seq_file *m, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(m)); + struct cgroup *cgrp; + cgrp = blkcg->css.cgroup; + + return sli_iolat_max_show(m, cgrp); +} + +static int io_cgroup_sli_show(struct seq_file *m, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(m)); + struct cgroup *cgrp; + cgrp = blkcg->css.cgroup; + + return sli_iolat_stat_show(m, cgrp); +} +#endif + static struct cftype blkcg_files[] = { { .name = "stat", @@ -1643,6 +1663,46 @@ static struct cftype blkcg_legacy_files[] = { .name = "diskstats_recursive", .seq_show = blkcg_dkstats_recursive_show, }, +#ifdef CONFIG_RQM + { + .name = "mbuf", + .open = cgroup_mbuf_open, + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_mbuf_show, + .seq_start = cgroup_mbuf_start, + .seq_next = cgroup_mbuf_next, + .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, + }, +#endif +#ifdef CONFIG_CGROUP_SLI + { + .name = "sli", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = io_cgroup_sli_show, + }, + { + .name = "sli_max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = io_cgroup_sli_max_show, + }, + { + .name = "sli.control", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_sli_control_write, + .seq_show = io_cgroup_sli_control_show, + }, + { + .name = "sli.monitor", + .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_sli_monitor_open, + .seq_show = cgroup_sli_monitor_show, + .seq_start = cgroup_sli_monitor_start, + .seq_next = cgroup_sli_monitor_next, + .seq_stop = cgroup_sli_monitor_stop, + .poll = cgroup_sli_monitor_poll, + }, +#endif { } /* terminate */ }; diff --git a/block/blk-core.c b/block/blk-core.c index 6856b49ed1186a44232e7b021a0d2cb62b941a8d..df2028efb3d40b4e935385ff5915e0a19641cf83 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -38,6 +38,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -86,6 +87,31 @@ static void blkcg_stat_acct(struct blkcg *blkcg, struct request *req, int new_io } } +#ifdef CONFIG_CGROUP_SLI +static void sli_iolat_stat_end_check(u64 rq_alloc_time_ns, u64 rq_io_start_time_ns, + struct bio *bio, struct blkcg *blkcg) +{ + struct cgroup *cgrp; + u64 sli_iolat_end_time = 0; + u64 bio_start = bio_issue_time(&bio->bi_issue); + + if (!bio_start || !rq_alloc_time_ns || !rq_io_start_time_ns || !blkcg || + blkcg == &blkcg_root) + return; + + cgrp = blkcg->css.cgroup; + if (!cgrp || !cgroup_parent(cgrp)) + return; + + sli_iolat_end_time = __bio_issue_time(ktime_get_ns()); + if (sli_iolat_end_time <= bio_start) + return; + + sli_iolat_stat_end(IO_LAT_DELAY, bio_start, rq_alloc_time_ns, rq_io_start_time_ns, + sli_iolat_end_time, sli_iolat_end_time - bio_start, bio, cgrp); +} +#endif + void blkcg_account_io_completion(struct request *req, struct bio *bio, unsigned int bytes) { @@ -95,6 +121,12 @@ void blkcg_account_io_completion(struct request *req, struct bio *bio, struct hd_struct *part; int cpu; +#ifdef CONFIG_CGROUP_SLI + if (static_branch_unlikely(&sli_io_enabled)) + sli_iolat_stat_end_check(req->alloc_time_ns, req->io_start_time_ns, + bio, blkcg); +#endif + cpu = part_stat_lock(); part = req->part; blkcg_part_stat_add(blkcg, cpu, part, sectors[rw], bytes >> 9); diff --git a/block/blk-mq.c b/block/blk-mq.c 
index b68fc393d78c9a34ac5038248cdd55f6d9a551bd..f13cbf9e916145bfcecc51d3a97486c881398551 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -326,7 +327,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#if defined(CONFIG_BLK_RQ_ALLOC_TIME) || defined(CONFIG_CGROUP_SLI) rq->alloc_time_ns = alloc_time_ns; #endif if (blk_mq_need_time_stamp(rq)) @@ -368,7 +369,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = ktime_get_ns(); - +#ifdef CONFIG_CGROUP_SLI + else if (static_branch_unlikely(&sli_io_enabled)) + alloc_time_ns = ktime_get_ns(); +#endif data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index be90c3cf67d0f8aed02237e149f7b5f882b7a7b1..f89b93f62fd14a80d7816173de028a73c406b93b 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -65,6 +65,61 @@ static int seq_open_net(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_RQM +/* taken from seq_open_net; everything is the same except that the private + * data is allocated with vmalloc. Why? + * + * someone may need a big private area, and wasting contiguous physical + * memory for it is undesirable; they can use this function to get a + * vmalloc'd private instead + * + * from now on, if you use this open ABI please provide a write fops + * like proc_simple_write, since we removed the pde->write check + */ +void *seq_open_net_large_private(struct inode *inode, struct file *file) +{ + struct net *net; + struct seq_file *seq; + struct seq_net_private *p; + int ret; + unsigned int state_size = PDE(inode)->state_size; + + WARN_ON_ONCE(state_size < sizeof(struct seq_net_private)); + + net = get_proc_net(inode); + if (!net) { + ret = -ENXIO; + goto out; + } + + p = vmalloc(state_size); + if (!p) { + ret = -ENOMEM; + goto put_out; + } + memset(p, 0, state_size); + + ret = seq_open(file, PDE(inode)->seq_ops); + if (ret < 0) + goto free_out; + + seq = file->private_data; + seq->private = (void *)p; + +#ifdef CONFIG_NET_NS + p->net = net; +#endif + return p; + +free_out: + vfree(p); +put_out: + put_net(net); +out: + return ERR_PTR(ret); +} +EXPORT_SYMBOL(seq_open_net_large_private); +#endif + static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; @@ -118,6 +173,30 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(proc_create_net_data); +#ifdef CONFIG_RQM +/* add an extended ABI that lets callers define the fops themselves; this is + * just like proc_create_net_data except that it takes an extra f_ops parameter + */ +struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *f_ops) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + + pde_force_lookup(p); + p->proc_fops = f_ops; + p->seq_ops = seq_ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data_ops); +#endif + /** * proc_create_net_data_write - Create a writable net_ns-specific proc file * @name: The name of the file.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 875f5433b6369a26f5a11c79b702dc735c49c85b..24b101a246e962c00408d67fdc0fa99182233204 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -203,7 +203,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#if defined(CONFIG_BLK_RQ_ALLOC_TIME) || defined(CONFIG_CGROUP_SLI) /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; #endif diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ca005a23ce3ec2fa4bf0551243f7165d7f40602c..ca6fde2c3cfdf4d7ceb0eaa41d01c9a5830582e3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -522,6 +522,9 @@ struct cgroup { /* sched latency stat */ struct sli_schedlat_stat __percpu *sli_schedlat_stat_percpu; + /* io latency stat */ + struct sli_iolat_stat __percpu *sli_iolat_stat_percpu; + /* proactive event monitoring structure for cgroup */ struct sli_event_monitor *cgrp_event_monitor; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4b3345b37c0452f71be2a4eeb7dbcfb6ac4a46fe..fea0909b2e443ef9362d521f61c5d3bb9ed8e01e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -1009,12 +1009,13 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} ssize_t cgroup_priority(struct cgroup_subsys_state *css); struct cgroup *get_cgroup_from_task(struct task_struct *task); -ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...); ssize_t mbuf_print_task(struct task_struct *task, const char *fmt, ...); void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos); void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos); void cgroup_mbuf_stop(struct seq_file *s, void *v); int cgroup_mbuf_show(struct seq_file *s, void *v); +int cgroup_mbuf_open(struct kernfs_open_file *of); +void cgroup_mbuf_release(struct kernfs_open_file *of); int cgroup_sli_monitor_open(struct kernfs_open_file *of); void *cgroup_sli_monitor_start(struct seq_file *s, loff_t *pos); diff --git a/include/linux/mbuf.h b/include/linux/mbuf.h index 32779eee2ef569bdd7fc8fd8ffa58e402bb5ddad..7f3705923f86e981bb14db98c789db000b1f941d 100644 --- a/include/linux/mbuf.h +++ b/include/linux/mbuf.h @@ -1,13 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2021 bauerchen + * Copyright (C) 2024 mengensun */ #ifndef _CGROUP_MBUF_H #define _CGROUP_MBUF_H -#include -#include +#include struct mbuf_struct { u32 mbuf_len; @@ -47,28 +46,37 @@ struct mbuf_user_desc { /* each cgroup has a mbuf_slot struct */ struct mbuf_slot { u32 idx; - /* write op must hold this lock */ - spinlock_t slot_lock; + /* snapshot/write op must hold this lock */ + seqlock_t slot_lock; /* rate limit */ struct ratelimit_state ratelimit; - struct cgroup *owner; + void *owner; const struct mbuf_operations *ops; struct mbuf_ring *mring; - struct mbuf_user_desc *udesc; }; struct mbuf_operations { /* read message */ - ssize_t (*read) (struct mbuf_slot *, struct mbuf_user_desc *); + ssize_t (*read)(struct mbuf_slot *_slot, struct mbuf_user_desc *udest); + /* get next available idx */ - u32 (*next) (struct mbuf_ring *, u32); + u32 (*next)(struct mbuf_ring *mring, u32 idx); + /* write message */ - ssize_t (*write) (struct cgroup *, const char *, va_list); + ssize_t (*write)(struct mbuf_slot *mbuf, const char *fmt, va_list args); } ____cacheline_aligned; void __init mbuf_bmap_init(void); void __init setup_mbuf(void); + struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg); +struct mbuf_slot 
*mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops); void mbuf_free(struct cgroup *cg); + +ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...); +void snapshot_mbuf(struct mbuf_slot *, struct mbuf_slot*, seqlock_t *); +u32 get_mbuf_slot_len(void); +void mbuf_free_slot(struct mbuf_slot *slot); +void mbuf_reset(struct mbuf_slot *mbuf); #endif diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d22c1c7fa774b9c35c8aeafcab687d47cfb44445..8b9cb125a10ed5e7f8ecb68b415edef65ef44c19 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -81,6 +81,15 @@ extern struct pid *tgid_pidfd_to_pid(const struct file *file); extern struct net init_net; extern struct list_head sysctl_restrict_list; +#ifdef CONFIG_NETNS_MBUF +void *seq_open_net_large_private(struct inode *inode, struct file *file); +struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *proc_ops); +#endif + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must diff --git a/include/linux/sli.h b/include/linux/sli.h index 32c901b87e9b831b1c23ddfc19f1ce9686f0f2b7..963864f366e528099faadb64924092b23fae2df4 100755 --- a/include/linux/sli.h +++ b/include/linux/sli.h @@ -37,6 +37,11 @@ enum sli_schedlat_stat_item { SCHEDLAT_STAT_NR }; +enum sli_iolat_stat_item { + IO_LAT_DELAY, + IO_LAT_STAT_NR +}; + struct sli_memlat_stat { unsigned long latency_max[MEM_LAT_STAT_NR]; unsigned long item[MEM_LAT_STAT_NR][LAT_COUNT_NR]; @@ -47,10 +52,16 @@ struct sli_schedlat_stat { unsigned long item[SCHEDLAT_STAT_NR][LAT_COUNT_NR]; }; +struct sli_iolat_stat { + unsigned long latency_max[IO_LAT_STAT_NR]; + unsigned long item[IO_LAT_STAT_NR][LAT_COUNT_NR]; +}; + enum sli_event_type { SLI_SCHED_EVENT, SLI_MEM_EVENT, SLI_LONGTERM_EVENT, + SLI_IO_EVENT, SLI_EVENT_NR }; @@ -100,6 +111,10 @@ struct sli_event_monitor { unsigned long long longterm_threshold[SLI_LONGTERM_NR]; atomic_long_t longterm_statistics[SLI_LONGTERM_NR]; + unsigned long long iolat_threshold[IO_LAT_STAT_NR]; + unsigned long long iolat_count[IO_LAT_STAT_NR]; + atomic_long_t iolat_statistics[IO_LAT_STAT_NR]; + KABI_RESERVE(1); KABI_RESERVE(2); }; @@ -129,9 +144,17 @@ void sli_schedlat_stat(struct task_struct *task,enum sli_schedlat_stat_item sidx void sli_schedlat_rundelay(struct task_struct *task, struct task_struct *prev, u64 delta); int sli_schedlat_stat_show(struct seq_file *m, struct cgroup *cgrp); int sli_schedlat_max_show(struct seq_file *m, struct cgroup *cgrp); +void sli_iolat_stat_end(enum sli_iolat_stat_item sidx, u64 bio_start, u64 rq_alloc_time_ns, + u64 rq_io_start_time_ns, u64 sli_iolat_end_time, u64 duration, struct bio *bio, + struct cgroup *cgrp); +int sli_iolat_max_show(struct seq_file *m, struct cgroup *cgrp); +int sli_iolat_stat_show(struct seq_file *m, struct cgroup *cgrp); ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cgroup_sli_control_show(struct seq_file *sf, void *v); +int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v); +int mem_cgroup_sli_control_show(struct seq_file *sf, void *v); +int io_cgroup_sli_control_show(struct seq_file *sf, void *v); void sli_check_longsys(struct task_struct *tsk); void sli_update_tick(struct task_struct *tsk); @@ -145,5 +168,6 @@ void sli_monitor_stop(struct seq_file *seq, void *v); __poll_t sli_monitor_poll(struct 
kernfs_open_file *of, poll_table *pt); int sli_event_add(struct sli_notify_event *notify_event, u32 event_type, u32 levent, u32 count); u32 sli_monitor_signal(struct cgroup *cgrp, struct sli_notify_event *notify_event); +DECLARE_STATIC_KEY_FALSE(sli_io_enabled); #endif /*_LINUX_SLI_H*/ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 539972f4325cdc73e03b97dc7db47c1552700ba6..de8123fea2330a0345162daceb8286eb61f2b201 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -38,6 +38,9 @@ #include #include #include +#ifdef CONFIG_NETNS_MBUF +#include +#endif struct user_namespace; struct proc_dir_entry; @@ -190,6 +193,9 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; +#ifdef CONFIG_NETNS_MBUF + struct net_mbuf mbuf; +#endif } __randomize_layout; #include diff --git a/include/net/netns_mbuf.h b/include/net/netns_mbuf.h new file mode 100644 index 0000000000000000000000000000000000000000..6a272949c4a43a3df50bdafa9823a90ace330676 --- /dev/null +++ b/include/net/netns_mbuf.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * make mbuf can be used by net namespace + * + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc + */ +#ifndef __NETNS_MBUF +#define __NETNS_MBUF + +#include +#include + +#ifdef CONFIG_NETNS_MBUF +struct net_mbuf { + struct proc_dir_entry *twatcher; + struct proc_dir_entry *log; + struct mbuf_slot *slot; +}; + +int inet_mbuf_init(void); +void inet_mbuf_exit(void); +ssize_t net_mbuf_print(struct net *net, const char *fmt, ...); +#else +static __always_inline int inet_mbuf_init(void) {return 0; } +static __always_inline void inet_mbuf_exit(void) {} +static __always_inline ssize_t net_mbuf_print(struct net *net, const char *fmt, ...) {return 0; }; +#endif +#endif diff --git a/include/net/tcp.h b/include/net/tcp.h index d7660fda7bd436c144a6c4147259e74bdd0cc291..63cbb6af9d2dc45e8afa87181cf370493c1cf27f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -879,6 +879,7 @@ struct tcp_skb_cb { has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ + __u32 first_xmit_time; union { struct { /* There is space for up to 24 bytes */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f925f708e248b6f77baeb9c34bca53d197961027..9ec853feba300370e536d3b85332a40fce247ba2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3937,6 +3937,13 @@ static int cgroup_net_quality_show(struct seq_file *seq, void *v) #endif #ifdef CONFIG_CGROUP_SLI +static int cgroup_sli_io_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgroup = seq_css(seq)->cgroup; + + return sli_iolat_stat_show(seq, cgroup); +} + static int cgroup_sli_memory_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; @@ -3957,17 +3964,21 @@ static int cgroup_sli_max_show(struct seq_file *seq, void *v) struct cgroup *cgroup = seq_css(seq)->cgroup; sli_schedlat_max_show(seq, cgroup); - return sli_memlat_max_show(seq, cgroup); + sli_memlat_max_show(seq, cgroup); + return sli_iolat_max_show(seq, cgroup); } #endif void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos) { - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; u32 index; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)ctx->psi.trigger; - if (!mb) + /* why: see cgroup_mbuf_open */ + if 
(!mb->mring) return NULL; index = *pos; @@ -3975,29 +3986,26 @@ void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos) if (index && index == mb->mring->next_idx) return NULL; - if (!mb->udesc) { - mb->udesc = kmalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); - - if (!mb->udesc) - goto out; - - mb->udesc->user_idx = mb->mring->first_idx; - mb->udesc->user_seq = mb->mring->first_seq; - } + udesc->user_idx = mb->mring->first_idx; + udesc->user_seq = mb->mring->first_seq; /* Maybe reach end or empty */ - if (mb->udesc->user_idx == mb->mring->next_idx) + if (udesc->user_idx == mb->mring->next_idx) return NULL; -out: - return mb->udesc; + return udesc; } void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos) { struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + + /* why: see cgroup_mbuf_open */ + if (!mb->mring) + return NULL; udesc->user_idx = mb->ops->next(mb->mring, udesc->user_idx); *pos = udesc->user_idx; @@ -4008,26 +4016,19 @@ void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos) return udesc; } -void cgroup_mbuf_stop(struct seq_file *s, void *v) -{ - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_user_desc *desc; - - if (cgrp->mbuf) { - desc = cgrp->mbuf->udesc; - if(desc && desc->user_idx == cgrp->mbuf->mring->next_idx) { - kfree(cgrp->mbuf->udesc); - cgrp->mbuf->udesc = NULL; - } - } -} +void cgroup_mbuf_stop(struct seq_file *s, void *v) { } int cgroup_mbuf_show(struct seq_file *s, void *v) { ssize_t ret; struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + + /* why: see cgroup_mbuf_open */ + if (!mb->mring) + return 0; memset(udesc->buf, 0, sizeof(udesc->buf)); ret = mb->ops->read(mb, udesc); @@ -4038,6 +4039,56 @@ int cgroup_mbuf_show(struct seq_file *s, void *v) return 0; } +extern u32 get_mbuf_slot_len(void); +int cgroup_mbuf_open(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = seq_css(of->seq_file)->cgroup->mbuf; + u32 mbuf_slot_len; + + /* use ctx->psi.trigger for mbuf_user_desc */ + ctx->psi.trigger = kzalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); + if (!ctx->psi.trigger) + return -ENOMEM; + + mbuf_slot_len = get_mbuf_slot_len(); + /* use ctx->procs1.pidlist for mbuf_slot snapshot */ + ctx->procs1.pidlist = vmalloc(mbuf_slot_len); + if (!ctx->procs1.pidlist) { + kfree(ctx->psi.trigger); + ctx->psi.trigger = NULL; + return -ENOMEM; + } + memset(ctx->procs1.pidlist, 0, mbuf_slot_len); + + /* cgroup may have no mbuf attached, because the mbuf pool + * has a max num + * here we let file open success, so, seq_ops must + * check mring point + */ + if (!mb) + return 0; + + snapshot_mbuf((struct mbuf_slot *)ctx->procs1.pidlist, mb, &mb->slot_lock); + + return 0; +} + +void cgroup_mbuf_release(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->psi.trigger) { + kfree(ctx->psi.trigger); + ctx->psi.trigger = NULL; + } + + if (ctx->procs1.pidlist) { + vfree(ctx->procs1.pidlist); + ctx->procs1.pidlist = NULL; + } +} + /* * Get cgroup struct from task_struct for mbuf and sli. 
* @@ -4082,7 +4133,7 @@ ssize_t mbuf_print_task(struct task_struct *task, const char *fmt, ...) if (mb->ops) { va_start(args, fmt); - mb->ops->write(cgrp, fmt, args); + mb->ops->write(mb, fmt, args); va_end(args); } @@ -4106,7 +4157,7 @@ ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...) if (mb->ops) { va_start(args, fmt); - mb->ops->write(cgrp, fmt, args); + mb->ops->write(mb, fmt, args); va_end(args); } @@ -5645,16 +5696,23 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_net_quality_show, .release = cgroup_net_release, }, -#endif { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_CGROUP_SLI + { + .name = "sli.io", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_sli_io_show, + }, { .name = "sli.memory", .flags = CFTYPE_NOT_ON_ROOT, @@ -6141,6 +6199,11 @@ static inline bool cgroup_need_mbuf(struct cgroup *cgrp) return true; #endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) + if (cgroup_css(cgrp, cgroup_subsys[io_cgrp_id])) + return true; +#endif + return false; } @@ -6399,10 +6462,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (cgrp->mbuf) mbuf_free(cgrp); -#ifdef CONFIG_CGROUP_SLI - if (cgrp->sctx) - sctx_free(cgrp); -#endif /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); diff --git a/kernel/cgroup/mbuf.c b/kernel/cgroup/mbuf.c index 6d7cb88273aa04e2ffeab1a383099c07a934e619..78f81312d455725e24f43bfa30310c9fd2cece40 100644 --- a/kernel/cgroup/mbuf.c +++ b/kernel/cgroup/mbuf.c @@ -1,11 +1,11 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: GPL-2.0-only /* * Quality Monitor Buffer * Aim to provide backup buffer for RQM to record critical message. * Could be used to catch critical context when abnormal jitters occur. 
* - * Author: bauerchen - * Copyright (C) 2021 Tencent, Inc + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc */ #include @@ -21,14 +21,20 @@ /* Define max mbuf len is 8M, and min is 2M */ #define MBUF_LEN_MAX (1 << 23) #define MBUF_LEN_MIN (1 << 21) -#define MBUF_LEN_DEF MBUF_LEN_MIN +/* + * from now, every netns has a mbuf, because + * change the mbuf slot size is dangerous, so + * double the total buffer size to double + * total mbuf slot num (see MBUF_SLOTS_DEF) + */ +#define MBUF_LEN_DEF (1 << 22) #define MBUF_MSG_LEN_MAX 1024 /* Monitor buffer support max 1024 items */ #define MBUF_SLOTS_MAX 1024 #define MBUF_SLOTS_MIN 256 -#define MBUF_SLOTS_DEF 512 +#define MBUF_SLOTS_DEF 1024 /* Global mbuf metadata struct */ static struct mbuf_struct g_mbuf = { @@ -51,7 +57,7 @@ static void __init mbuf_len_update(u64 size) (u64)MBUF_LEN_MAX); } - if (size < MBUF_LEN_MIN){ + if (size < MBUF_LEN_MIN) { size = (u64) MBUF_LEN_MIN; pr_warn("mbuf: monitor buffer less [ %llu ] is not supported.\n", (u64) MBUF_LEN_MIN); @@ -112,7 +118,7 @@ void __init mbuf_bmap_init(void) L1_CACHE_BYTES); mbuf_bitmap = kmalloc(alloc_size, __GFP_HIGH|__GFP_ZERO); - if(!mbuf_bitmap){ + if (!mbuf_bitmap) { pr_err("mbuf: alloc mbuf_bitmap failed!\n"); return; } @@ -159,7 +165,7 @@ static u32 mbuf_next(struct mbuf_ring *mring, u32 curr_idx) * just goto head */ frees = mring->end_idx - next_idx; - if(frees < sizeof(struct mbuf_ring_desc)){ + if (frees < sizeof(struct mbuf_ring_desc)) { next_idx = mring->base_idx; goto next; } @@ -224,9 +230,8 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size) { u32 frees; - if (unlikely(msg_size > MBUF_MSG_LEN_MAX)) { + if (unlikely(msg_size > MBUF_MSG_LEN_MAX)) return -ENOMEM; - } while (mring->first_seq < mring->next_seq) { @@ -247,26 +252,26 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size) } /* Write monitor buffer message */ -static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) +static ssize_t do_mbuf_write(struct mbuf_slot *mbuf, char *buffer, size_t size) { struct mbuf_ring *mring; struct mbuf_ring_desc *desc; size_t len; unsigned long flags; - if (size >= g_mbuf.mbuf_size_per_cg){ + if (size >= g_mbuf.mbuf_size_per_cg) { pr_err("mbuf: write message need less than [ %u ] bytes\n", g_mbuf.mbuf_size_per_cg); return 0; } - mring = cg->mbuf->mring; + mring = mbuf->mring; len = sizeof(struct mbuf_ring_desc) + size; - spin_lock_irqsave(&cg->mbuf->slot_lock, flags); + write_seqlock_irqsave(&mbuf->slot_lock, flags); - if (mbuf_prepare(mring, len)){ - spin_unlock_irqrestore(&cg->mbuf->slot_lock, flags); + if (mbuf_prepare(mring, len)) { + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); pr_err("mbuf: Can not find enough space.\n"); return 0; } @@ -285,20 +290,25 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) mring->next_idx += desc->len; mring->next_seq++; - spin_unlock_irqrestore(&cg->mbuf->slot_lock, flags); + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); return size; } -void mbuf_reset(struct mbuf_ring *mring) +void mbuf_reset(struct mbuf_slot *mbuf) { - mring->first_idx = mring->base_idx; - mring->first_seq = 0; - mring->next_idx = mring->base_idx; - mring->next_seq = 0; + unsigned long flags; + + write_seqlock_irqsave(&mbuf->slot_lock, flags); + mbuf->mring->first_idx = mbuf->mring->base_idx; + mbuf->mring->first_seq = 0; + mbuf->mring->next_idx = mbuf->mring->base_idx; + mbuf->mring->next_seq = 0; + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); } +EXPORT_SYMBOL(mbuf_reset); -static 
ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) +static ssize_t mbuf_write(struct mbuf_slot *mbuf, const char *fmt, va_list args) { static char buf[MBUF_MSG_LEN_MAX]; char *text = buf; @@ -308,7 +318,7 @@ static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) t_len = vscnprintf(text, sizeof(buf), fmt, args); /* Write string to mbuf */ - ret = do_mbuf_write(cg, text, t_len); + ret = do_mbuf_write(mbuf, text, t_len); return ret; } @@ -330,23 +340,30 @@ static int get_next_mbuf_id(unsigned long *addr, u32 start) return index; } -static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index) +static void mbuf_slot_init(struct mbuf_slot *mb, + void *owner, u32 index, struct mbuf_operations *ops) { - mb->owner = cg; + mb->owner = owner; mb->idx = index; - mb->ops = &mbuf_ops; - spin_lock_init(&mb->slot_lock); - ratelimit_state_init(&mb->ratelimit, 5 * HZ,50); + + if (!ops) + mb->ops = &mbuf_ops; + else + mb->ops = ops; + + seqlock_init(&mb->slot_lock); + ratelimit_state_init(&mb->ratelimit, 5 * HZ, 50); mb->mring = (struct mbuf_ring *)((char *)mb + sizeof(struct mbuf_slot)); - mb->mring->base_idx = index * - g_mbuf.mbuf_size_per_cg + sizeof(struct mbuf_slot) + sizeof(struct mbuf_ring); + mb->mring->base_idx = index * g_mbuf.mbuf_size_per_cg + + sizeof(struct mbuf_slot) + + sizeof(struct mbuf_ring); mb->mring->end_idx = (index + 1) * g_mbuf.mbuf_size_per_cg - 1; - mbuf_reset(mb->mring); + mbuf_reset(mb); } -struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops) { struct mbuf_slot *mb; u32 index = 0; @@ -395,25 +412,158 @@ struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) g_mbuf.mbuf_next_id = index; mb = (struct mbuf_slot *)(g_mbuf.mbuf + index * g_mbuf.mbuf_size_per_cg); - mbuf_slot_init(mb, cg, index); + mbuf_slot_init(mb, owner, index, ops); g_mbuf.mbuf_frees--; spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); return mb; } +EXPORT_SYMBOL(mbuf_slot_alloc_v2); -void mbuf_free(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +{ + return mbuf_slot_alloc_v2((void *)cg, NULL); +} +EXPORT_SYMBOL(mbuf_slot_alloc); + +void mbuf_free_slot(struct mbuf_slot *slot) { unsigned long flags; spin_lock_irqsave(&g_mbuf.mbuf_lock, flags); - /* Make current idx the next available buffer */ - g_mbuf.mbuf_next_id = cg->mbuf->idx; + g_mbuf.mbuf_next_id = slot->idx; __clear_bit(g_mbuf.mbuf_next_id, g_mbuf.mbuf_bitmap); - g_mbuf.mbuf_frees++; spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); + +} +EXPORT_SYMBOL(mbuf_free_slot); + +void mbuf_free(struct cgroup *cg) +{ + unsigned long flags; + struct mbuf_slot *slot; + + spin_lock_irqsave(&cg->cgrp_mbuf_lock, flags); + slot = cg->mbuf; + cg->mbuf = NULL; + spin_unlock_irqrestore(&cg->cgrp_mbuf_lock, flags); + + mbuf_free_slot(slot); +} + +static u32 rd_mbuf_next(struct mbuf_ring *mring, u32 curr_idx) +{ + struct mbuf_ring_desc *cdesc, *ndesc; + u32 frees, next_idx; + void *start; + + start = (void *)(mring + 1); + cdesc = (struct mbuf_ring_desc *)(start + curr_idx); + next_idx = curr_idx + cdesc->len; + + frees = mring->end_idx - next_idx; + if (frees < sizeof(struct mbuf_ring_desc)) { + /* end */ + if (next_idx == mring->next_idx) + return next_idx; + + /*buffer wrapped to head */ + next_idx = mring->base_idx; + goto next; + } + + ndesc = (struct mbuf_ring_desc *)(start + next_idx); + + /* same magic can't be said */ + if (!ndesc->len && next_idx != mring->next_idx) + next_idx = mring->base_idx; +next: + 
return next_idx; } +static ssize_t rd_mbuf_read(struct mbuf_slot *mb, struct mbuf_user_desc *udesc) +{ + struct mbuf_ring_desc *desc; + ssize_t ret; + size_t i, len, tbuf_len; + char *start; + + tbuf_len = sizeof(udesc->buf); + start = (char *)(mb->mring + 1); + desc = (struct mbuf_ring_desc *)(start + udesc->user_idx); + + len = sprintf(udesc->buf, "%llu:", desc->ts_ns); + start = (char *)(desc + 1); + + for (i = 0; i < desc->text_len; i++) { + unsigned char c = start[i]; + + if (c < ' ' || c >= 127 || c == '\\') + continue; + else + udesc->buf[len++] = c; + if (len >= tbuf_len) + break; + } + + len = len >= tbuf_len ? tbuf_len - 1 : len; + udesc->buf[len] = '\n'; + udesc->user_seq++; + ret = len; + return ret; +} + +/* these ops are the read-side ABI of the mbuf; the mbuf has a write op + * which is protected by the slot seqlock, but there is no read/write-side + * protection beyond that. + * use it as follows: + * + * call snapshot_mbuf to copy data from the mbuf into `dst`, then read + * the dst with the following ops + * + * every index is an offset from the start of the snapshot's mring, + * instead of from the global mbuf memory pool + * + * btw: the private data of the seq file is the ideal place to hold the + * snapshot + */ +const struct mbuf_operations rd_mbuf_ops = { + .read = rd_mbuf_read, + .next = rd_mbuf_next, +}; + +void snapshot_mbuf(struct mbuf_slot *dst, struct mbuf_slot *src, seqlock_t *lock) +{ + unsigned int seq; + + do { + /* the peer of this lock is the write side; we want the writer + * to win when there is a conflict, and this reader retries + * until it gets a consistent snapshot of the buffer + */ + cond_resched(); + seq = read_seqbegin(lock); + memcpy((void *)dst, (void *)src, g_mbuf.mbuf_size_per_cg); + } while (read_seqretry(lock, seq)); + + /* all the ops in `rd_mbuf_ops` see an idx offset from the start + * of the mring.
so here adjust the idx as a whole + */ + dst->mring = (struct mbuf_ring *)(dst + 1); + dst->mring->end_idx = dst->mring->end_idx - dst->mring->base_idx; + dst->mring->first_idx = dst->mring->first_idx - dst->mring->base_idx; + dst->mring->next_idx = dst->mring->next_idx - dst->mring->base_idx; + dst->mring->base_idx = 0; + dst->ops = &rd_mbuf_ops; +} +EXPORT_SYMBOL(snapshot_mbuf); + +/* the mbuf size per cg is not changed once the system booted up */ +u32 get_mbuf_slot_len(void) +{ + return g_mbuf.mbuf_size_per_cg; +} +EXPORT_SYMBOL(get_mbuf_slot_len); diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c index faa4756b73e52a87af9e71834e7184a3d408a204..71e9a58b6c0cc7894d515ed65ca9d0ab35965c8a 100755 --- a/kernel/cgroup/sli.c +++ b/kernel/cgroup/sli.c @@ -18,6 +18,7 @@ #define MAX_STACK_TRACE_DEPTH 64 static DEFINE_STATIC_KEY_FALSE(sli_enabled); +DEFINE_STATIC_KEY_FALSE(sli_io_enabled); static DEFINE_STATIC_KEY_FALSE(sli_monitor_enabled); static struct sli_event_monitor default_sli_event_monitor; @@ -59,12 +60,17 @@ static const char *longterm_threshold_name[] = { "longterm_irqtime_threshold=" }; +static const char *iolat_threshold_name[] = { + "iolat_delay_threshold=" +}; + static const char *sanity_check_abbr[] = { "schedlat_", "memlat_", "longterm_", "period=", - "mbuf_enable=" + "mbuf_enable=", + "iolat_" }; static void sli_proactive_monitor_work(struct work_struct *work); @@ -96,6 +102,8 @@ static void sli_event_monitor_init(struct sli_event_monitor *event_monitor, stru memset(&event_monitor->memlat_threshold, 0xff, sizeof(event_monitor->memlat_threshold)); memset(&event_monitor->memlat_count, 0xff, sizeof(event_monitor->memlat_count)); memset(&event_monitor->longterm_threshold, 0xff, sizeof(event_monitor->longterm_threshold)); + memset(&event_monitor->iolat_threshold, 0xff, sizeof(event_monitor->iolat_threshold)); + memset(&event_monitor->iolat_count, 0xff, sizeof(event_monitor->iolat_count)); event_monitor->last_update = jiffies; event_monitor->cgrp = cgrp; @@ -153,6 +161,12 @@ static int sli_event_inherit(struct cgroup *cgrp) &cgrp_event_monitor->longterm_statistics[new_event->event_id], sli_get_longterm_statistics(cgrp, new_event->event_id)); break; + case SLI_IO_EVENT: + cgrp_event_monitor->iolat_threshold[new_event->event_id] = + READ_ONCE(event_monitor->iolat_threshold[new_event->event_id]); + cgrp_event_monitor->iolat_count[new_event->event_id] = + READ_ONCE(event_monitor->iolat_count[new_event->event_id]); + break; default: printk(KERN_ERR "%s: invalid sli_event type!\n", __func__); goto failed; @@ -182,13 +196,12 @@ static int sli_event_inherit(struct cgroup *cgrp) } static void store_task_stack(struct task_struct *task, char *reason, - u64 duration, unsigned int skipnr) + u64 duration, unsigned int skipnr, struct cgroup *cgrp) { unsigned long *entries; unsigned nr_entries = 0; unsigned long flags; int i; - struct cgroup *cgrp; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_ATOMIC); @@ -197,7 +210,6 @@ static void store_task_stack(struct task_struct *task, char *reason, nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, skipnr); - cgrp = get_cgroup_from_task(task); spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); mbuf_print(cgrp, "record reason:%s comm:%s pid:%d duration=%lld\n", @@ -242,6 +254,21 @@ static char * get_memlat_name(enum sli_memlat_stat_item sidx) return name; } +static char *get_iolat_name(enum sli_iolat_stat_item sidx) +{ + char *name = NULL; + + switch (sidx) { + case IO_LAT_DELAY: + name = "iolat_delay"; + break; + 
default: + break; + } + + return name; +} + static enum sli_lat_count get_lat_count_idx(u64 duration) { enum sli_lat_count idx; @@ -428,7 +455,7 @@ void sli_memlat_stat_end(enum sli_memlat_stat_item sidx, u64 start) char *lat_name; lat_name = get_memlat_name(sidx); - store_task_stack(current, lat_name, duration, 0); + store_task_stack(current, lat_name, duration, 0, cgrp); } } } @@ -437,6 +464,106 @@ void sli_memlat_stat_end(enum sli_memlat_stat_item sidx, u64 start) rcu_read_unlock(); } +static u64 sli_iolat_stat_gather(struct cgroup *cgrp, + enum sli_iolat_stat_item sidx, + enum sli_lat_count cidx) +{ + u64 sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += per_cpu_ptr(cgrp->sli_iolat_stat_percpu, cpu)->item[sidx][cidx]; + + return sum; +} + +int sli_iolat_stat_show(struct seq_file *m, struct cgroup *cgrp) +{ + enum sli_iolat_stat_item sidx; + + if (!static_branch_likely(&sli_io_enabled)) { + seq_printf(m, "sli_io is not enabled, please echo 1 > /proc/sli/sli_io_enabled\n"); + return 0; + } + + if (!cgrp->sli_iolat_stat_percpu) + return 0; + + for (sidx = IO_LAT_DELAY; sidx < IO_LAT_STAT_NR; sidx++) { + seq_printf(m, "%s:\n", get_iolat_name(sidx)); + seq_printf(m, "0-1ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_0_1)); + seq_printf(m, "1-4ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_1_4)); + seq_printf(m, "4-8ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_4_8)); + seq_printf(m, "8-16ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_8_16)); + seq_printf(m, "16-32ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_16_32)); + seq_printf(m, "32-64ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_32_64)); + seq_printf(m, "64-128ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_64_128)); + seq_printf(m, ">=128ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_128_INF)); + } + + return 0; +} + +int sli_iolat_max_show(struct seq_file *m, struct cgroup *cgrp) +{ + enum sli_iolat_stat_item sidx; + + if (!static_branch_likely(&sli_io_enabled)) { + seq_printf(m, "sli_io is not enabled, please echo 1 > /proc/sli/sli_io_enabled\n"); + return 0; + } + + if (!cgrp->sli_iolat_stat_percpu) + return 0; + + for (sidx = IO_LAT_DELAY; sidx < IO_LAT_STAT_NR; sidx++) { + int cpu; + unsigned long latency_sum = 0; + + for_each_possible_cpu(cpu) + latency_sum += per_cpu_ptr(cgrp->sli_iolat_stat_percpu, cpu)->latency_max[sidx]; + + seq_printf(m, "%s: %lu\n", get_iolat_name(sidx), latency_sum); + } + + return 0; +} + +void sli_iolat_stat_end(enum sli_iolat_stat_item sidx, u64 bio_start, u64 rq_alloc_time_ns, + u64 rq_io_start_time_ns, u64 sli_iolat_end_time, u64 duration, struct bio *bio, + struct cgroup *cgrp) +{ + enum sli_lat_count cidx; + + cidx = get_lat_count_idx(duration); + duration = duration >> 10; + this_cpu_inc(cgrp->sli_iolat_stat_percpu->item[sidx][cidx]); + this_cpu_add(cgrp->sli_iolat_stat_percpu->latency_max[sidx], duration); + + if (static_branch_unlikely(&sli_monitor_enabled)) { + struct sli_event_monitor *event_monitor = cgrp->cgrp_event_monitor; + + if (duration < READ_ONCE(event_monitor->iolat_threshold[sidx])) + return; + + atomic_long_inc(&event_monitor->iolat_statistics[sidx]); + if (event_monitor->mbuf_enable) { + char *lat_name; + unsigned long flags; + char b[BDEVNAME_SIZE]; + + lat_name = get_iolat_name(sidx); + spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); + mbuf_print(cgrp, "record reason:%s devname:%s duration_us=%lld " + "bio_start=%llu req_start=%llu req_issue=%llu " + "bio_complete=%llu\n", lat_name, bio_devname(bio, b), + duration, bio_start, 
rq_alloc_time_ns, + rq_io_start_time_ns, sli_iolat_end_time); + spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags); + } + } +} + void sli_schedlat_stat(struct task_struct *task, enum sli_schedlat_stat_item sidx, u64 delta) { struct cgroup *cgrp = NULL; @@ -465,7 +592,7 @@ void sli_schedlat_stat(struct task_struct *task, enum sli_schedlat_stat_item sid char *lat_name; lat_name = get_schedlat_name(sidx); - store_task_stack(task, lat_name, delta, 0); + store_task_stack(task, lat_name, delta, 0, cgrp); } } } @@ -627,6 +754,20 @@ static void sli_proactive_monitor_work(struct work_struct *work) sli_event_add(notify_event, event->event_type, event->event_id, (int)(statistics - last_statistics)); break; + case SLI_IO_EVENT: + statistics = (u64)atomic_long_read( + &event_monitor->iolat_statistics[event->event_id]); + atomic_long_set(&event_monitor->iolat_statistics[event->event_id], 0); + + if (event_monitor->overrun) { + event_monitor->overrun = 0; + break; + } + + if (statistics >= READ_ONCE(event_monitor->iolat_count[event->event_id])) + sli_event_add(notify_event, event->event_type, + event->event_id, statistics); + break; default: break; } @@ -640,64 +781,98 @@ static void sli_proactive_monitor_work(struct work_struct *work) css_put(&event_monitor->cgrp->self); } +struct cgroup *get_cgroup_from_task_id(struct task_struct *task, int event_nr) +{ + int id; + struct cgroup *cgrp; + + id = cpuacct_cgrp_id; + switch (event_nr) { +#if IS_ENABLED(CONFIG_MEMCG) + case SLI_MEM_EVENT: + id = memory_cgrp_id; + break; +#endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) + case SLI_IO_EVENT: + id = io_cgrp_id; + break; +#endif + default: + break; + } + + /* First, try to get cpuacct/mem cgroup for V1*/ + cgrp = task_cgroup(task, id); + if (cgrp && cgrp->level) + return cgrp; + + /* + * If can not find cpuacct/mem cgroup or cpuacct/mem cgroup is root, just return + * dfl_cgrp. + */ + cgrp = task_dfl_cgroup(task); + + return cgrp; +} + void sli_update_tick(struct task_struct *tsk) { struct cgroup *cgrp; + int i; if (!static_branch_likely(&sli_monitor_enabled)) return; rcu_read_lock(); - cgrp = get_cgroup_from_task(tsk); - if (cgrp && cgroup_parent(cgrp)) { - bool ret; - int period; - unsigned long long old_value, last_update; + for (i = 0; i < SLI_EVENT_NR; i++) { + cgrp = get_cgroup_from_task_id(tsk, i); - period = cgrp->cgrp_event_monitor->period; - if (!period) - goto unlock; + if (cgrp && cgroup_parent(cgrp)) { + bool ret; + int period; + unsigned long long old_value, last_update; -retry: - last_update = READ_ONCE(cgrp->cgrp_event_monitor->last_update); - if (time_after((unsigned long)(period + last_update), jiffies)) - goto unlock; + period = cgrp->cgrp_event_monitor->period; + if (!period) + continue; - old_value = cmpxchg(&cgrp->cgrp_event_monitor->last_update, - last_update, jiffies); - if (old_value != last_update) - goto retry; +retry: + last_update = READ_ONCE(cgrp->cgrp_event_monitor->last_update); + if (time_after((unsigned long)(period + last_update), jiffies)) + continue; - /* - * Current jiffies should be somewhere between period and 8 * period, - * otherwise we consider the it is overrun and should be abandoned. 
- */ - if (time_before((unsigned long)((period << 3) + last_update), jiffies)) - cgrp->cgrp_event_monitor->overrun = 1; + old_value = cmpxchg(&cgrp->cgrp_event_monitor->last_update, + last_update, jiffies); + if (old_value != last_update) + goto retry; - rcu_read_unlock(); + /* + * Current jiffies should be somewhere between period and 8 * period, + * otherwise we consider the it is overrun and should be abandoned. + */ + if (time_before((unsigned long)((period << 3) + last_update), jiffies)) + cgrp->cgrp_event_monitor->overrun = 1; - ret = css_tryget(&cgrp->self); - if (!ret) - return; + ret = css_tryget(&cgrp->self); + if (!ret) + continue; - /* - * The sli trace work may have a lot a work to do, and should send - * the event to polling tasks. So we don't do the work in interrupt - * context(put the work to the workqueue). - */ - ret = queue_work(sli_workqueue, &cgrp->cgrp_event_monitor->sli_event_work); - /* - * If work had been pushed to workqueue and not been executed, there is no - * need to push it again. So we must put the css refcount. - */ - if (!ret) - css_put(&cgrp->self); - return; + /* + * The sli trace work may have a lot a work to do, and should send + * the event to polling tasks. So we don't do the work in interrupt + * context(put the work to the workqueue). + */ + ret = queue_work(sli_workqueue, &cgrp->cgrp_event_monitor->sli_event_work); + /* + * If work had been pushed to workqueue and not been executed, there is no + * need to push it again. So we must put the css refcount. + */ + if (!ret) + css_put(&cgrp->self); + } } - -unlock: rcu_read_unlock(); } @@ -780,7 +955,8 @@ static unsigned long sli_get_longterm_statistics(struct cgroup *cgrp, return latency_sum; } -static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec) +static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec, + int index) { char *str; int i, len, ret; @@ -788,15 +964,21 @@ static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec) /* Replace the delimiter with '\0' */ len = strlen(buf); - for (i = 0; i < len; i++) { - if (buf[i] == ',' || buf[i] == ' ') { - buf[i] = '\0'; - break; + if (len == 0) + return -EINVAL; + + /* longterm_rundelay/irqtime dont need check */ + if (index != 2) { + for (i = 0; i < len; i++) { + if (buf[i] == ',' || buf[i] == ' ') { + buf[i] = '\0'; + break; + } } - } - if (i == len) - return -EINVAL; + if (i == len) + return -EINVAL; + } /* Parse the value for theshold */ ret = kstrtou64(buf, 0, &value); @@ -805,6 +987,10 @@ static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec) sec->threshold = sli_convert_value(value, false); + /* longterm_rundelay/irqtime dont need count= param*/ + if (index == 2) + return 0; + /* Move the pointer to the positon which after the delimiter */ buf += (i + 1); len -= (i + 1); @@ -842,7 +1028,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec return -EINVAL; buf += min_len; - ret = sli_parse_threshold(buf, sec); + ret = sli_parse_threshold(buf, sec, index); if (ret) return ret; @@ -860,7 +1046,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec return -EINVAL; buf += min_len; - ret = sli_parse_threshold(buf, sec); + ret = sli_parse_threshold(buf, sec, index); if (ret) return ret; @@ -878,7 +1064,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec return -EINVAL; buf += min_len; - ret = sli_parse_threshold(buf, sec); + ret = sli_parse_threshold(buf, sec, 
index); if (ret) return ret; @@ -906,7 +1092,24 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec sec->mbuf_enable = !!value; break; + case 5: + for (i = 0; i < ARRAY_SIZE(iolat_threshold_name); i++) { + min_len = min(len, (int)strlen((const char *)iolat_threshold_name[i])); + if (!strncmp(iolat_threshold_name[i], buf, min_len)) + break; + } + + if (i == ARRAY_SIZE(iolat_threshold_name)) + return -EINVAL; + + buf += min_len; + ret = sli_parse_threshold(buf, sec, index); + if (ret) + return ret; + sec->event_type = SLI_IO_EVENT; + sec->event_id = i; + break; default: return -EINVAL; } @@ -1054,6 +1257,14 @@ ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, sli_get_longterm_statistics(cgrp, sec.event_id)); ret = sli_event_update(event_monitor, &sec, last_threshold); break; + case SLI_IO_EVENT: + last_threshold = event_monitor->iolat_threshold[sec.event_id]; + WRITE_ONCE(event_monitor->iolat_threshold[sec.event_id], sec.threshold); + WRITE_ONCE(event_monitor->iolat_count[sec.event_id], sec.count); + smp_wmb(); + atomic_long_set(&event_monitor->iolat_statistics[sec.event_id], 0); + ret = sli_event_update(event_monitor, &sec, last_threshold); + break; default: break; } @@ -1067,6 +1278,115 @@ ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, return ret; } +int io_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + inode_lock_shared(file_inode(sf->file)); + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); + + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } + + inode_unlock_shared(file_inode(sf->file)); + return 0; +} + +int mem_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + inode_lock_shared(file_inode(sf->file)); + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); + + for (i = 0; i < MEM_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->memlat_threshold[i], true); + count = sli_convert_value(event_monitor->memlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i), + threshold, count); + } + + inode_unlock_shared(file_inode(sf->file)); + return 0; +} + +int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + inode_lock_shared(file_inode(sf->file)); + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", 
event_monitor->mbuf_enable); + + for (i = 0; i < SCHEDLAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->schedlat_threshold[i], true); + count = sli_convert_value(event_monitor->schedlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_schedlat_name(i), + threshold, count); + } + + for (i = 0; i < SLI_LONGTERM_NR; i++) { + threshold = sli_convert_value(event_monitor->longterm_threshold[i], true); + + seq_printf(sf, "%s: threshold: %llu\n", get_longterm_name(i), threshold); + } + + if (!cgroup_parent(cgrp)) { + for (i = 0; i < MEM_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->memlat_threshold[i], true); + count = sli_convert_value(event_monitor->memlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i), + threshold, count); + } + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } + } + + inode_unlock_shared(file_inode(sf->file)); + return 0; +} int cgroup_sli_control_show(struct seq_file *sf, void *v) { int i; @@ -1080,6 +1400,7 @@ int cgroup_sli_control_show(struct seq_file *sf, void *v) else event_monitor = &default_sli_event_monitor; + inode_lock_shared(file_inode(sf->file)); seq_printf(sf, "period: %d\n", event_monitor->period); seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); @@ -1105,6 +1426,14 @@ int cgroup_sli_control_show(struct seq_file *sf, void *v) seq_printf(sf, "%s: threshold: %llu\n", get_longterm_name(i), threshold); } + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } + inode_unlock_shared(file_inode(sf->file)); return 0; } @@ -1330,6 +1659,7 @@ static ssize_t sli_enabled_write(struct file *file, const char __user *ubuf, goto out; } + inode_lock(file_inode(file)); switch (val) { case '0': if (static_key_enabled(&sli_enabled)) @@ -1342,6 +1672,7 @@ static ssize_t sli_enabled_write(struct file *file, const char __user *ubuf, default: ret = -EINVAL; } + inode_unlock(file_inode(file)); out: return ret; @@ -1355,15 +1686,73 @@ static const struct file_operations sli_enabled_fops = { .release = single_release, }; +static int sli_io_enabled_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", static_key_enabled(&sli_io_enabled)); + return 0; +} + +static int sli_io_enabled_open(struct inode *inode, struct file *file) +{ + return single_open(file, sli_io_enabled_show, NULL); +} + +static ssize_t sli_io_enabled_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + char val = -1; + int ret = count; + + if (count < 1 || *ppos) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&val, ubuf, 1)) { + ret = -EFAULT; + goto out; + } + + inode_lock(file_inode(file)); + switch (val) { + case '0': + if (static_key_enabled(&sli_io_enabled)) + static_branch_disable(&sli_io_enabled); + break; + case '1': + if (!static_key_enabled(&sli_io_enabled)) + static_branch_enable(&sli_io_enabled); + break; + default: + ret = -EINVAL; + } + inode_unlock(file_inode(file)); + +out: + return ret; +} + +static const struct file_operations sli_io_enabled_fops = { + .open = sli_io_enabled_open, 
+ .read = seq_read, + .write = sli_io_enabled_write, + .llseek = seq_lseek, + .release = single_release, +}; + int sli_cgroup_alloc(struct cgroup *cgroup) { if (!cgroup_need_sli(cgroup)) return 0; spin_lock_init(&cgroup->cgrp_mbuf_lock); + cgroup->sli_iolat_stat_percpu = alloc_percpu(struct sli_iolat_stat); + if (!cgroup->sli_iolat_stat_percpu) + goto out; + cgroup->sli_memlat_stat_percpu = alloc_percpu(struct sli_memlat_stat); if (!cgroup->sli_memlat_stat_percpu) - goto out; + goto free_iolat_percpu; cgroup->sli_schedlat_stat_percpu = alloc_percpu(struct sli_schedlat_stat); if (!cgroup->sli_schedlat_stat_percpu) @@ -1385,6 +1774,8 @@ int sli_cgroup_alloc(struct cgroup *cgroup) free_percpu(cgroup->sli_schedlat_stat_percpu); free_memlat_percpu: free_percpu(cgroup->sli_memlat_stat_percpu); +free_iolat_percpu: + free_percpu(cgroup->sli_iolat_stat_percpu); out: return -ENOMEM; } @@ -1393,6 +1784,8 @@ void sli_cgroup_free(struct cgroup *cgroup) { struct sli_event *event, *event_tmp; + if (cgroup->sctx) + sctx_free(cgroup); /* * Cgroup's subsys would be cleared before sli_cgroup_free() had been called. * So we use !cgroup->cgrp_event_monitor instead of cgroup_need_sli to check @@ -1401,6 +1794,7 @@ void sli_cgroup_free(struct cgroup *cgroup) if (!cgroup->cgrp_event_monitor) return; + free_percpu(cgroup->sli_iolat_stat_percpu); free_percpu(cgroup->sli_memlat_stat_percpu); free_percpu(cgroup->sli_schedlat_stat_percpu); /* Free memory from the event list */ @@ -1422,6 +1816,7 @@ static int __init sli_proc_init(void) } proc_mkdir("sli", NULL); proc_create("sli/sli_enabled", 0, NULL, &sli_enabled_fops); + proc_create("sli/sli_io_enabled", 0, NULL, &sli_io_enabled_fops); return 0; } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0259cfdeff8379e76e406c79371232dca7bf343f..4f49ec1eb14fa3011361ac50e87972cfe6424642 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -411,14 +411,18 @@ static struct cftype files[] = { .name = "uptime", .seq_show = cpuacct_uptime_show, }, +#ifdef CONFIG_RQM { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_PSI { .name = "cpu.pressure", @@ -443,7 +447,7 @@ static struct cftype files[] = { { .name = "sli.control", .write = cgroup_sli_control_write, - .seq_show = cgroup_sli_control_show, + .seq_show = cpuacct_cgroup_sli_control_show, }, { .name = "sli.monitor", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca9a6dedd786fd4d68bd744efc8992d8f261b9e8..d5c39baf26a96971d6205d8ffed614bebb02078c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1617,6 +1617,8 @@ static void priority_kill_process(struct task_struct *victim, struct task_struct *p; struct mm_struct *mm; struct mem_cgroup *memcg; + unsigned long flags; + struct cgroup *cgrp; p = find_lock_task_mm(victim); if (!p) { @@ -1637,8 +1639,11 @@ static void priority_kill_process(struct task_struct *victim, /* Now we select [ victim ] to kill, just record it to mbuf */ memcg = mem_cgroup_from_task(victim); - mbuf_print(memcg->css.cgroup, "memqos: Killing process [ %s ] pid [ %d ] for memory reclaim", + cgrp = memcg->css.cgroup; + spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); + mbuf_print(cgrp, "memqos: Killing process [ %s ] pid [ %d ] for memory reclaim", victim->comm, victim->pid); + spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags); /* Get a reference to safely compare mm 
after task_unlock(victim) */ mm = victim->mm; @@ -7714,6 +7719,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .release = cgroup_pressure_release, }, #endif +#ifdef CONFIG_RQM + { + .name = "mbuf", + .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, + .seq_show = cgroup_mbuf_show, + .seq_start = cgroup_mbuf_start, + .seq_next = cgroup_mbuf_next, + .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, + }, +#endif #ifdef CONFIG_CGROUP_SLI { .name = "sli", @@ -7725,6 +7742,22 @@ static struct cftype mem_cgroup_legacy_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = mem_cgroup_sli_max_show, }, + { + .name = "sli.control", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_sli_control_write, + .seq_show = mem_cgroup_sli_control_show, + }, + { + .name = "sli.monitor", + .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_sli_monitor_open, + .seq_show = cgroup_sli_monitor_show, + .seq_start = cgroup_sli_monitor_start, + .seq_next = cgroup_sli_monitor_next, + .seq_stop = cgroup_sli_monitor_stop, + .poll = cgroup_sli_monitor_poll, + }, #endif #ifdef CONFIG_EMM_MEMORY_RECLAIM { diff --git a/net/Kconfig b/net/Kconfig index 1d4211802ebdd29393e465f2ccb9a9bae54016ea..d45927d9593bcf49a5819504b4aa9cc395988c87 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -99,6 +99,15 @@ source "net/netlabel/Kconfig" endif # if INET +config NETNS_MBUF + bool "attach a mbuf to net namespace" + depends on RQM && INET && PROC_FS + default y + ---help--- + this allows attach a mbuf to each net namespace. + + if you are unsure how to answer this question, answer N. + config NETWORK_SECMARK bool "Security Marking" help diff --git a/net/core/Makefile b/net/core/Makefile index 518cdc0878c94f9df30625b281b15e6c62b8a277..73c483e230135ebdc69af29730d17e0cc29525c7 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o +obj-$(CONFIG_NETNS_MBUF) += netns_mbuf.o diff --git a/net/core/netns_mbuf.c b/net/core/netns_mbuf.c new file mode 100644 index 0000000000000000000000000000000000000000..81dee51f5c85f5035c8b21c3c2b59e6590988975 --- /dev/null +++ b/net/core/netns_mbuf.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* make mbuf can be used by net namespace + * + * Author: mengensun + * Author: yuehongwu + * Copyright (C) 2024 Tencent, Inc + */ +#include +#include +#include + +#include +#include + +extern int sysctl_qos_mbuf_enable; +struct mbuf_seq_data { + struct seq_net_private snp; + struct mbuf_user_desc udesc; + struct mbuf_slot snapshot[]; +}; + +static inline struct mbuf_slot *get_net_mbuf(struct net *net) +{ + return net->mbuf.slot; +} + +/* not controlled by sysctl_qos_mbuf_enable because we will + * have a /proc/net/ipv4/netlat/enable in later patch + */ +ssize_t net_mbuf_print(struct net *net, const char *fmt, ...) +{ + va_list args; + struct mbuf_slot *slot; + + slot = net->mbuf.slot; + if (!slot || !__ratelimit(&slot->ratelimit)) + goto out; + + va_start(args, fmt); + slot->ops->write(slot, fmt, args); + va_end(args); +out: + return 0; +} +EXPORT_SYMBOL(net_mbuf_print); + +/* udesc is the user side interface, used to get data from mbuf, + * we can alloc a udesc per user, not to alloc a udesc and bind + * to mbuf when user accessing mbuf. 
+ * + * The seq_file private data is the ideal place to hold the udesc; + * keeping the udesc there keeps everything simple. + */ +static void *netns_mbuf_start(struct seq_file *s, loff_t *pos) +{ + u32 index; + struct mbuf_user_desc *udesc; + struct mbuf_seq_data *pd; + + pd = s->private; + udesc = &pd->udesc; + index = *pos; + + /* why: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return NULL; + + /* If we already reached the end, just return */ + if (index && index == pd->snapshot->mring->next_idx) + return NULL; + + udesc->user_idx = pd->snapshot->mring->first_idx; + udesc->user_seq = pd->snapshot->mring->first_seq; + + /* The ring may already be at the end, or empty */ + if (udesc->user_idx == pd->snapshot->mring->next_idx) + return NULL; + return udesc; +} + +static void *netns_mbuf_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct mbuf_seq_data *pd; + struct mbuf_user_desc *udesc = v; + + pd = s->private; + + /* why: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return NULL; + + udesc->user_idx = pd->snapshot->ops->next(pd->snapshot->mring, + udesc->user_idx); + *pos = udesc->user_idx; + if (udesc->user_idx == pd->snapshot->mring->next_idx) + return NULL; + + return udesc; +} + +static void netns_mbuf_stop(struct seq_file *s, void *v) { } + +static int netns_mbuf_show(struct seq_file *s, void *v) +{ + ssize_t ret; + struct mbuf_seq_data *pd; + struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; + + pd = s->private; + + /* why: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return 0; + + memset(udesc->buf, 0, sizeof(udesc->buf)); + ret = pd->snapshot->ops->read(pd->snapshot, udesc); + if (ret > 0) + seq_printf(s, "%s", udesc->buf); + return 0; +} + +static int seq_mbuf_open(struct inode *inode, struct file *file) +{ + struct mbuf_seq_data *p; + struct mbuf_slot *mbuf; + + p = seq_open_net_large_private(inode, file); + + if (IS_ERR(p)) + return PTR_ERR(p); + + mbuf = get_net_mbuf(p->snp.net); + /* The netns may have no mbuf attached, because the mbuf + * pool has a maximum size. We still let the open succeed, + * so every seq_ops callback must check the mring pointer. + * + * Note: the private data was zeroed in + * seq_open_net_large_private(). + */ + if (!mbuf) + return 0; + + snapshot_mbuf(p->snapshot, mbuf, &mbuf->slot_lock); + return 0; +} + +/* This function is taken from seq_release_net(); it is identical + * except that it uses **vfree** to free the private data. + */ +static int seq_mbuf_release(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + + put_net(seq_file_net(seq)); + vfree(seq->private); + seq->private = NULL; + seq_release(ino, f); + return 0; +} + +/* any write clears the recorded data */ +ssize_t seq_mbuf_write(struct file *f, const char __user *ubuf, + size_t size, loff_t *_pos) +{ + struct seq_file *seq = f->private_data; + struct mbuf_seq_data *p; + struct mbuf_slot *mb; + + p = seq->private; + mb = get_net_mbuf(p->snp.net); + + /* this netns has no mbuf attached */ + if (!mb) + return size; + + mbuf_reset(mb); + return size; +} + +/* seq_read() holds a mutex while calling these functions, but + * that mutex belongs to the struct file, not to the inode, so it + * only serializes mbuf access among tasks sharing the same file + * object (e.g. multiple threads of one process). + * + * If several processes access the mbuf through their own file + * objects, their accesses are not serialized.
+ */ +static const struct seq_operations mbuf_seq_ops = { + .show = netns_mbuf_show, + .start = netns_mbuf_start, + .next = netns_mbuf_next, + .stop = netns_mbuf_stop, +}; + +static const struct file_operations mbuf_seq_fops = { + .open = seq_mbuf_open, + .read = seq_read, + .write = seq_mbuf_write, + .llseek = seq_lseek, + .release = seq_mbuf_release, +}; + +extern struct proc_dir_entry *proc_create_net_data_ops(const char *name, + umode_t mode, struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *fops); + +static int __net_init net_mbuf_init(struct net *net) +{ + int ret = 0; + + /* If the mbuf allocation fails, still let the netns creation + * succeed: returning an error here would put an artificial + * limit on how many netns can be created on this system. + * + * Note: mbuf_slot currently has a maximum count of 1024; once + * all slots are in use, further allocations may fail. In that + * case we keep the user interface unchanged and let netlat + * simply report nothing. + * cgroup is used for kabi + * + * When the IPv4 protocol stack is initialized, the root netns + * (init_net) needs a mbuf_slot, so make sure this very first + * allocation can succeed. + */ + if (sysctl_qos_mbuf_enable || net == &init_net) { + net->mbuf.slot = mbuf_slot_alloc_v2((void *)net, NULL); + if (!net->mbuf.slot) + pr_err("netns: failed to alloc mbuf\n"); + } else + net->mbuf.slot = NULL; + + net->mbuf.twatcher = proc_net_mkdir(net, "twatcher", net->proc_net); + if (!net->mbuf.twatcher) { + ret = -ENOMEM; + goto free_mbuf; + } + + net->mbuf.log = proc_create_net_data_ops("log", S_IFREG | 0644, + net->mbuf.twatcher, + &mbuf_seq_ops, + sizeof(struct mbuf_seq_data) + get_mbuf_slot_len(), + NULL, &mbuf_seq_fops); + if (!net->mbuf.log) { + ret = -ENOMEM; + goto remove_watcher; + } + return ret; + +remove_watcher: + remove_proc_entry("twatcher", net->proc_net); + +free_mbuf: + if (net->mbuf.slot) + mbuf_free_slot(net->mbuf.slot); + return ret; +} + +static void __net_exit net_mbuf_exit(struct net *net) +{ + remove_proc_entry("log", net->mbuf.twatcher); + remove_proc_entry("twatcher", net->proc_net); + + /* if the mbuf allocation failed, there is nothing to free */ + if (!net->mbuf.slot) + return; + mbuf_free_slot(net->mbuf.slot); +} + +static struct pernet_operations net_mbuf_ops = { + .init = net_mbuf_init, + .exit = net_mbuf_exit, +}; + +int inet_mbuf_init(void) +{ + return register_pernet_subsys(&net_mbuf_ops); +} +EXPORT_SYMBOL(inet_mbuf_init); + +void inet_mbuf_exit(void) +{ + unregister_pernet_subsys(&net_mbuf_ops); +} +EXPORT_SYMBOL(inet_mbuf_exit); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index f5af8c6b2f87ecb53f55914f4044c9cff5bf1896..3d305a02fef8b92bcab0d970fae5ae413648f070 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -432,6 +432,18 @@ config INET_UDP_DIAG Support for UDP socket monitoring interface used by the ss tool. If unsure, say Y. +config NETLAT + bool "INET: allow collecting net latency info" + depends on NETNS_MBUF + default y + ---help--- + Enable hooks in the network stack to collect latency information. + If a latency exceeds the threshold configured through the user + interface, a message is printed to the mbuf. + + If unsure, say N.
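To make the interface above concrete, here is a minimal user-space sketch (not part of the patch) that reads and then clears the per-netns mbuf log. It assumes the file created by net_mbuf_init() is visible as /proc/net/twatcher/log inside the current network namespace; run it in the netns you want to inspect.

/*
 * Sketch only: dump the per-netns mbuf log, then clear it.
 * The path is derived from proc_net_mkdir(..., "twatcher", ...) and
 * proc_create_net_data_ops("log", ...) above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Print whatever latency messages have been recorded so far. */
	fd = open("/proc/net/twatcher/log", O_RDONLY);
	if (fd < 0) {
		perror("open /proc/net/twatcher/log");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);

	/* Any write resets the ring (see seq_mbuf_write() above). */
	fd = open("/proc/net/twatcher/log", O_WRONLY);
	if (fd < 0) {
		perror("open /proc/net/twatcher/log for write");
		return 1;
	}
	if (write(fd, "0", 1) < 0)
		perror("clear");
	close(fd);
	return 0;
}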
+ + config INET_RAW_DIAG tristate "RAW: socket monitoring interface" depends on INET_DIAG && (IPV6 || IPV6=n) diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9e1a186a3671e249f3d10b14497098772447267d..921a5c8186ef7f5aa6fc8a5d2e73402435f16bd4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-$(CONFIG_NETLAT) += netlat.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 36405511052abfc2c6a207445ce9ff8020732397..26d35bca543681ea558fa8aaa08c0665c92ace7f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -2075,6 +2075,10 @@ fs_initcall(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS +#ifdef CONFIG_NETNS_MBUF +extern int inet_mbuf_init(void); +extern void inet_mbuf_exit(void); +#endif static int __init ipv4_proc_init(void) { int rc = 0; @@ -2087,11 +2091,19 @@ static int __init ipv4_proc_init(void) goto out_udp; if (ping_proc_init()) goto out_ping; +#ifdef CONFIG_NETNS_MBUF + if (inet_mbuf_init()) + goto out_mbuf; +#endif if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: +#ifdef CONFIG_NETNS_MBUF + inet_mbuf_exit(); +out_mbuf: +#endif ping_proc_exit(); out_ping: udp4_proc_exit(); diff --git a/net/ipv4/netlat.c b/net/ipv4/netlat.c new file mode 100644 index 0000000000000000000000000000000000000000..3984e85109683706f9092ff1f5b060c1a17d09b4 --- /dev/null +++ b/net/ipv4/netlat.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Net Latency Monitor base on Quality Monitor Buffer + * Aim to provide net latency for a long running system + * + * Author: mengensun + * Author: yuehongwu + * Copyright (C) 2024 Tencent, Inc + */ + +#include +#include +#include +#include +#include "netlat.h" + +struct netlat_net_data { + int ack; + int pick; + int queue; + int enable; + unsigned long *ports; + struct ctl_table_header *netlat_hdr; +}; + +static unsigned int netlat_net_id __read_mostly; +DEFINE_STATIC_KEY_FALSE(enable_netlat); + +static inline int get_ack_lat(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->ack; +} + +static inline int get_pick_lat(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->pick; +} + +static inline int get_queue_lat(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->queue; +} + +static inline long *get_net_ports(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->ports; +} + +static inline u32 get_rtxq_skb_jiffies(struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->first_xmit_time; +} + +static inline void set_rtxq_skb_jiffies(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->first_xmit_time = tcp_jiffies32; +} + +/* sk is not used for now, but, may be used in the future + */ +void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, + struct sk_buff *src) +{ + if (!static_branch_unlikely(&enable_netlat)) + return; + TCP_SKB_CB(dst)->first_xmit_time = TCP_SKB_CB(src)->first_xmit_time; +} +EXPORT_SYMBOL(netlat_copy_rtxq_skb); + +static inline u32 tcp_jiffies32_delt(struct sk_buff *skb) +{ + u32 j1, j2; + + j1 = tcp_jiffies32; 
+ j2 = get_rtxq_skb_jiffies(skb); + + /* There is a small window here: when the skb is allocated, + * ack_num is initialized to 0, so if we never touched the + * timestamp stored there it is still zero. + */ + if (!j2) + return 0; + + if (likely(j1 >= j2)) + return j1 - j2; + /* the u32 counter wrapped around */ + return U32_MAX - (j2 - j1) + 1; +} + +/* sk is not used for now, but may be used in the future + */ +void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb) +{ + if (!static_branch_unlikely(&enable_netlat)) + return; + set_rtxq_skb_jiffies(skb); +} +EXPORT_SYMBOL(netlat_tcp_enrtxqueue); + +/* print a message to the per-netns mbuf when an ACK latency above + * the configured threshold is observed + */ +void netlat_ack_check(struct sock *sk, struct sk_buff *skb) +{ + struct net *net; + s64 thresh; + s64 lat; + long *ports; + + if (!static_branch_unlikely(&enable_netlat)) + return; + + net = sock_net(sk); + + thresh = get_ack_lat(net); + if (!thresh) + return; + + lat = tcp_jiffies32_delt(skb); + if (lat < thresh) + return; + + ports = get_net_ports(net); + if (!test_bit(sk->sk_num, ports)) + return; + + net_mbuf_print(net, "TCP AC %u %pI4 %d %pI4 %d\n", + (unsigned int)(jiffies_to_msecs(lat)), + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); +} +EXPORT_SYMBOL(netlat_ack_check); + +/* netlat/enable is only visible in the root netns. + * + * The following three functions must be called with `lock` (defined + * below) held. Under that lock we follow these rules: + * + * 1. when `enable` is turned off: if we have enabled + * net_timestamp, disable it + * + * 2. when `enable` is turned on: if `pick/queue` need + * net_timestamp, enable it + * + * 3. when `pick/queue` are written and need net_timestamp + * while `enable` is off, just record that net_timestamp + * is needed and do nothing, leaving it to rule 2 above + * + * 4.
when `pick/queue` are writing and need enable + * net_timestamp and if `enable` enabled, just + * enable net_timestamp by themself + */ +static struct mutex lock = __MUTEX_INITIALIZER(lock); +static unsigned long need_time_stamp; + +/* for pick/queue write: see comment above */ +static void handle_net_timestamp(bool closed) +{ + /*!0->0*/ + if (closed) { + need_time_stamp--; + if (need_time_stamp == 0 && + static_branch_unlikely(&enable_netlat)) + net_disable_timestamp(); + return; + } + + /*0->!0*/ + need_time_stamp++; + if (need_time_stamp == 1 && + static_branch_unlikely(&enable_netlat)) + net_enable_timestamp(); +} + +/* for enable write: see comment above */ +static void handle_netlat_enable(bool closed) +{ + /*!0->0*/ + if (closed) { + if (need_time_stamp) + net_disable_timestamp(); + static_branch_disable(&enable_netlat); + return; + } + + /*0->!0*/ + if (need_time_stamp) + net_enable_timestamp(); + static_branch_enable(&enable_netlat); +} + +/* for netns exits: see comment above */ +static void handle_net_timestamp_exit(bool queue, bool pick) +{ + need_time_stamp -= queue; + need_time_stamp -= pick; + + if (!static_branch_unlikely(&enable_netlat)) + return; + /* if we dec the counter to zero and netlat enabled + * disable the timestamp + */ + if (!need_time_stamp && (queue || pick)) + net_disable_timestamp(); +} + +static int proc_do_netlat_pick(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int prev; + int ret; + struct netlat_net_data *pdata; + + mutex_lock(&lock); + + pdata = container_of(table->data, struct netlat_net_data, pick); + prev = pdata->pick; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + /* only change timestamp from 0->!0 or !0->0 */ + if (!!prev == !!pdata->pick) + goto unlock; + handle_net_timestamp(!!prev); + +unlock: + mutex_unlock(&lock); + return ret; +} + +static int proc_do_netlat_queue(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int prev; + int ret; + struct netlat_net_data *pdata; + + mutex_lock(&lock); + pdata = container_of(table->data, struct netlat_net_data, queue); + prev = pdata->queue; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + /* only change timestamp from 0->!0 or !0->0 */ + if (!!prev == !!pdata->queue) + goto unlock; + handle_net_timestamp(!!prev); + +unlock: + mutex_unlock(&lock); + return ret; +} + +static int proc_do_netlat_enable(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int prev; + int ret; + struct netlat_net_data *pdata; + + mutex_lock(&lock); + + pdata = container_of(table->data, struct netlat_net_data, enable); + prev = pdata->enable; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!!prev == !!pdata->enable) + goto unlock; + handle_netlat_enable(!!prev); + +unlock: + mutex_unlock(&lock); + return ret; +} + +static struct ctl_table ipv4_netlat[] = { + { + .procname = "lports", + .data = NULL, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, + { + .procname = "ack", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "queue", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_netlat_queue, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "pick", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = proc_do_netlat_pick, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "enable", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_netlat_enable, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int netlat_init_ipv4_ctl_table(struct net *net) +{ + int ret; + struct netlat_net_data *pdata; + struct ctl_table *table; + + table = ipv4_netlat; + pdata = net_generic(net, netlat_net_id); + + ret = 0; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ipv4_netlat), GFP_KERNEL); + if (!table) { + ret = -ENOMEM; + goto out; + } + + /* do not export enable to son netns */ + memset(&table[4], 0, sizeof(struct ctl_table)); + } + + pdata->ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!pdata->ports) { + ret = -ENOMEM; + goto free_table; + } + + table[0].data = &pdata->ports; + table[1].data = &pdata->ack; + table[2].data = &pdata->queue; + table[3].data = &pdata->pick; + + /* do not export enable to son netns*/ + if (net_eq(net, &init_net)) + table[4].data = &pdata->enable; + + pdata->netlat_hdr = register_net_sysctl(net, "net/ipv4/netlat", table); + if (!pdata->netlat_hdr) { + ret = -ENOMEM; + goto free_ports; + } + return ret; + +free_ports: + kfree(pdata->ports); +free_table: + if (!net_eq(net, &init_net)) + kfree(table); +out: + return ret; +} + +static void netlat_exit_ipv4_ctl_table(struct net *net) +{ + struct netlat_net_data *pdata; + struct ctl_table *table; + + pdata = net_generic(net, netlat_net_id); + + table = pdata->netlat_hdr->ctl_table_arg; + unregister_net_sysctl_table(pdata->netlat_hdr); + + /* root netns never exit*/ + if (net_eq(net, &init_net)) + return; + + mutex_lock(&lock); + handle_net_timestamp_exit(!!pdata->queue, !!pdata->pick); + mutex_unlock(&lock); + + kfree(table); + kfree(pdata->ports); +} + +/* print msg to per net mbuf when latency from + * netif to queued on tcp receive queue + */ +void netlat_queue_check(struct sock *sk, struct sk_buff *skb, int flags) +{ + struct net *net; + s64 lat; + int thresh; + long *ports; + + if (!static_branch_unlikely(&enable_netlat)) + return; + + net = sock_net(sk); + if (!skb->tstamp) + return; + + thresh = get_queue_lat(net); + if (!thresh) + return; + + ports = get_net_ports(net); + if (!test_bit(sk->sk_num, ports)) + return; + + if (!skb->tstamp) + return; + + lat = ktime_to_ms(net_timedelta(skb->tstamp)); + lat = lat < 0 ? 0 : lat; + if (lat < thresh) + return; + if (flags & QUEUE_FLAG_RCV) + net_mbuf_print(net, "TCP QU %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); + else /* QUEUE_FLAG_OFO for now */ + net_mbuf_print(net, "TCP OO %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); +} +EXPORT_SYMBOL(netlat_queue_check); + +/* print msg to per net mbuf when latency from + * netif to pick by usr app + */ +void netlat_pick_check(struct sock *sk, struct sk_buff *skb) +{ + struct net *net; + s64 lat; + int thresh; + long *ports; + + if (!static_branch_unlikely(&enable_netlat)) + return; + + net = sock_net(sk); + if (!skb->tstamp) + return; + + thresh = get_pick_lat(net); + if (!thresh) + return; + + ports = get_net_ports(net); + if (!test_bit(sk->sk_num, ports)) + return; + + if (!skb->tstamp) + return; + + lat = ktime_to_ms(net_timedelta(skb->tstamp)); + lat = lat < 0 ? 
0 : lat; + if (lat < thresh) + return; + + net_mbuf_print(net, "TCP PI %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); +} +EXPORT_SYMBOL(netlat_pick_check); + +static struct pernet_operations netlat_net_ops = { + .init = netlat_init_ipv4_ctl_table, + .exit = netlat_exit_ipv4_ctl_table, + .id = &netlat_net_id, + .size = sizeof(struct netlat_net_data), +}; + +/* add some config file in proc + */ +int netlat_net_init(void) +{ + return register_pernet_subsys(&netlat_net_ops); +} +EXPORT_SYMBOL(netlat_net_init); + +void netlat_net_exit(void) +{ + unregister_pernet_subsys(&netlat_net_ops); +} +EXPORT_SYMBOL(netlat_net_exit); diff --git a/net/ipv4/netlat.h b/net/ipv4/netlat.h new file mode 100644 index 0000000000000000000000000000000000000000..b0a59c5585a0407470da1f44cc7cf50cd156f7c1 --- /dev/null +++ b/net/ipv4/netlat.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * add a netlat to monitor tcp -package latency + * + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc + */ + +#ifndef H______NETLAT +#define H______NETLAT + +#ifdef CONFIG_NETLAT + +#define QUEUE_FLAG_OFO 0x1 +#define QUEUE_FLAG_RCV 0x2 + +int netlat_net_init(void); +void netlat_net_exit(void); +void netlat_ack_check(struct sock *sk, struct sk_buff *skb); +void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src); +void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb); +#define netlat_check(oldest, sk, skb) \ +do { \ + if (oldest) { \ + netlat_ack_check(sk, skb); \ + oldest = false; \ + } \ +} while (0) + +void netlat_queue_check(struct sock *sk, struct sk_buff *skb, int flags); +void netlat_pick_check(struct sock *sk, struct sk_buff *skb); + +#else /* CONFIG_NETLAT */ +static __always_inline int netlat_net_init(void) { return 0; }; +static __always_inline void netlat_net_exit(void) { }; +static __always_inline void netlat_ack_check(struct sock *sk, + struct sk_buff *skb) { }; +static __always_inline void netlat_copy_rtxq_skb(struct sock *sk, + struct sk_buff *dst, + struct sk_buff *src) { }; +static __always_inline void netlat_tcp_enrtxqueue(struct sock *sk, + struct sk_buff *skb) { }; +#define netlat_check(oldest, sk, skb) + +#define QUEUE_FLAG_OFO 0x1 +#define QUEUE_FLAG_RCV 0x2 +#define netlat_queue_check(sk, skb, flags) + +#define netlat_pick_check(sk, skb) +#endif /* !CONFIG_NETLAT */ +#endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d1253a5359f347e9c759bc262adac28551ebf7a5..8732a1712bf81ef5471069822c687ef7073128a6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1385,6 +1385,14 @@ static struct ctl_table ipv4_net_table[] = { { } }; +#ifdef CONFIG_NETLAT +extern int netlat_net_init(void); +/* + * this is not used for now, but sameone may used laterly + * just put here + */ +extern int netlat_net_exit(void); +#endif static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; @@ -1449,6 +1457,23 @@ static __init int sysctl_ipv4_init(void) return -ENOMEM; } +#ifdef CONFIG_NETLAT + /* + * this must after the register of ipv4_sysctl_ops + * because we are on the sub-tree of "net/ipv4" + * setup_net call init one by one, while cleanup_net + * call init one by one in reversed order + */ + if (netlat_net_init()) { + unregister_pernet_subsys(&ipv4_sysctl_ops); + unregister_net_sysctl_table(hdr); + return -ENOMEM; + } +#endif + /* + * btw: if someone adding some code here, do not + * forget the !!netlat_net_exit!! 
function + */ return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4063356a0c406add91a1c1c3c7ad947fc8e66355..9b60cdaa8d9333156a69dceee7c7bc72aaa40bf1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1953,6 +1953,9 @@ static int tcp_inq_hint(struct sock *sk) return inq; } +#ifdef CONFIG_NETLAT +extern void netlat_pick_check(struct sock *sk, struct sk_buff *skb); +#endif /* * This routine copies from a sock struct into the user buffer. * @@ -2179,15 +2182,23 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; - if (!(flags & MSG_PEEK)) + if (!(flags & MSG_PEEK)) { +#ifdef CONFIG_NETLAT + netlat_pick_check(sk, skb); +#endif sk_eat_skb(sk, skb); + } continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); - if (!(flags & MSG_PEEK)) + if (!(flags & MSG_PEEK)) { +#ifdef CONFIG_NETLAT + netlat_pick_check(sk, skb); +#endif sk_eat_skb(sk, skb); + } break; } while (len > 0); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9a2fd0cd68a245b7d7c58f8f5026f5338cfed19e..014b9e8bc4c4ccd206e6c4171b245c6f250f11ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3103,6 +3103,10 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, } } +#ifdef CONFIG_NETLAT +extern void netlat_ack_check(struct sock *sk, struct sk_buff *skb); +extern void netlat_queue_check(struct sock *sk, struct sk_buff *skb); +#endif /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. @@ -3124,6 +3128,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; + bool __maybe_unused netlat_oldest = true; int flag = 0; first_ackt = 0; @@ -3205,6 +3210,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; tcp_highest_sack_replace(sk, skb, next); +#ifdef CONFIG_NETLAT + /* + * here in for! we have make the ts of skb in rtx queue + * Monotonically incremental with the seq_num of skb,so + * here we can only report the oldest skb's latency. + * + * btw: the oldest skb's latency is the max latency see + * by this function + */ + if (netlat_oldest) { + netlat_ack_check(sk, skb); + netlat_oldest = false; + } +#endif tcp_rtx_queue_unlink_and_free(skb, sk); } @@ -4764,6 +4783,10 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); +#ifdef CONFIG_NETLAT + netlat_queue_check(sk, skb); +#endif + eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b8dd6d7cc91e00563db953d583d287f226b41a0c..c823f4ac218bc3a9ec0d66f1ba0faf83377efca5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -48,6 +48,11 @@ #include +#ifdef CONFIG_NETLAT +extern void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src); +extern void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb); +#endif + /* Refresh clocks of a TCP socket, * ensuring monotically increasing values. 
*/ @@ -72,6 +77,12 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); +#ifdef CONFIG_NETLAT + /* + * for the tcp in established status, and normal data skb + */ + netlat_tcp_enrtxqueue(sk, skb); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) @@ -1505,9 +1516,28 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); - if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) + if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) { +#ifdef CONFIG_NETLAT + /* + * for skb in rtx queue and be splitted: + * + * eg: we receive an ack, the ack only partially + * acked an skb in rtx queue, we need split the + * partially acked skb, release the acked bytes + * and collect the remained bytes to `buff`, insert + * buff to rtx queue again + * some other condition like: + * partially sacked a skb in rtx queue + * partially dsacked a skb in rtx queue + * .... + * here we should copy the origin skb's ts to the + * new one(buff) + */ + netlat_copy_rtxq_skb(sk, buff, skb); +#endif list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); + } return 0; } @@ -3386,6 +3416,14 @@ int tcp_send_synack(struct sock *sk) tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); +#ifdef CONFIG_NETLAT + /* + * for crossed SYN-ACK, eg: we are in + * syn-send status, and received a pure + * syn skb from the peer + */ + netlat_tcp_enrtxqueue(sk, nskb); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); @@ -3694,6 +3732,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); +#ifdef CONFIG_NETLAT + /* + * for fastopen sock which send data in the syn skb + */ + netlat_tcp_enrtxqueue(sk, syn_data); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; @@ -3745,6 +3789,10 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); +#ifdef CONFIG_NETLAT + /* for the syn package */ + netlat_tcp_enrtxqueue(sk, buff); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */
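As a closing usage note, below is a minimal user-space sketch (not part of the patch) of how the netlat knobs registered earlier might be driven. The paths follow register_net_sysctl(net, "net/ipv4/netlat", ...); judging from the proc handlers, "ack" is compared against a jiffies delta while "queue" and "pick" are compared against milliseconds, so the values used here are illustrative assumptions rather than recommended settings.

/*
 * Sketch only: arm the netlat thresholds, then watch the per-netns
 * mbuf log for records. Paths and record format are taken from the
 * patch above; threshold units are inferred from the handlers.
 */
#include <stdio.h>

static int put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Watch ports 80 and 443 (proc_do_large_bitmap takes lists/ranges). */
	put("/proc/sys/net/ipv4/netlat/lports", "80,443");
	/* Report skbs that sit unacknowledged too long (jiffies-based). */
	put("/proc/sys/net/ipv4/netlat/ack", "100");
	/* Report receive-path delays longer than ~50ms (milliseconds). */
	put("/proc/sys/net/ipv4/netlat/queue", "50");
	put("/proc/sys/net/ipv4/netlat/pick", "50");
	/* Only the root netns exposes the global switch. */
	put("/proc/sys/net/ipv4/netlat/enable", "1");

	puts("netlat armed; records appear in /proc/net/twatcher/log as");
	puts("\"TCP <AC|QU|OO|PI> <lat> <saddr> <sport> <daddr> <dport>\"");
	return 0;
}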