diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 46e955d076b5b4a03133b2e66b9ec428ba08c349..47fd6e0b6835949af3e3f298df2f7061bd2127f9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1620,6 +1620,26 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) return 0; } +#ifdef CONFIG_CGROUP_SLI +static int io_cgroup_sli_max_show(struct seq_file *m, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(m)); + struct cgroup *cgrp; + cgrp = blkcg->css.cgroup; + + return sli_iolat_max_show(m, cgrp); +} + +static int io_cgroup_sli_show(struct seq_file *m, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(m)); + struct cgroup *cgrp; + cgrp = blkcg->css.cgroup; + + return sli_iolat_stat_show(m, cgrp); +} +#endif + static struct cftype blkcg_files[] = { { .name = "stat", @@ -1643,6 +1663,46 @@ static struct cftype blkcg_legacy_files[] = { .name = "diskstats_recursive", .seq_show = blkcg_dkstats_recursive_show, }, +#ifdef CONFIG_RQM + { + .name = "mbuf", + .open = cgroup_mbuf_open, + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_mbuf_show, + .seq_start = cgroup_mbuf_start, + .seq_next = cgroup_mbuf_next, + .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, + }, +#endif +#ifdef CONFIG_CGROUP_SLI + { + .name = "sli", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = io_cgroup_sli_show, + }, + { + .name = "sli_max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = io_cgroup_sli_max_show, + }, + { + .name = "sli.control", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_sli_control_write, + .seq_show = io_cgroup_sli_control_show, + }, + { + .name = "sli.monitor", + .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_sli_monitor_open, + .seq_show = cgroup_sli_monitor_show, + .seq_start = cgroup_sli_monitor_start, + .seq_next = cgroup_sli_monitor_next, + .seq_stop = cgroup_sli_monitor_stop, + .poll = cgroup_sli_monitor_poll, + }, +#endif { } /* terminate */ }; diff --git a/block/blk-core.c b/block/blk-core.c index 6856b49ed1186a44232e7b021a0d2cb62b941a8d..df2028efb3d40b4e935385ff5915e0a19641cf83 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -38,6 +38,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -86,6 +87,31 @@ static void blkcg_stat_acct(struct blkcg *blkcg, struct request *req, int new_io } } +#ifdef CONFIG_CGROUP_SLI +static void sli_iolat_stat_end_check(u64 rq_alloc_time_ns, u64 rq_io_start_time_ns, + struct bio *bio, struct blkcg *blkcg) +{ + struct cgroup *cgrp; + u64 sli_iolat_end_time = 0; + u64 bio_start = bio_issue_time(&bio->bi_issue); + + if (!bio_start || !rq_alloc_time_ns || !rq_io_start_time_ns || !blkcg || + blkcg == &blkcg_root) + return; + + cgrp = blkcg->css.cgroup; + if (!cgrp || !cgroup_parent(cgrp)) + return; + + sli_iolat_end_time = __bio_issue_time(ktime_get_ns()); + if (sli_iolat_end_time <= bio_start) + return; + + sli_iolat_stat_end(IO_LAT_DELAY, bio_start, rq_alloc_time_ns, rq_io_start_time_ns, + sli_iolat_end_time, sli_iolat_end_time - bio_start, bio, cgrp); +} +#endif + void blkcg_account_io_completion(struct request *req, struct bio *bio, unsigned int bytes) { @@ -95,6 +121,12 @@ void blkcg_account_io_completion(struct request *req, struct bio *bio, struct hd_struct *part; int cpu; +#ifdef CONFIG_CGROUP_SLI + if (static_branch_unlikely(&sli_io_enabled)) + sli_iolat_stat_end_check(req->alloc_time_ns, req->io_start_time_ns, + bio, blkcg); +#endif + cpu = part_stat_lock(); part = req->part; blkcg_part_stat_add(blkcg, cpu, part, sectors[rw], bytes >> 9); diff --git a/block/blk-mq.c b/block/blk-mq.c 
index b68fc393d78c9a34ac5038248cdd55f6d9a551bd..f13cbf9e916145bfcecc51d3a97486c881398551 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -326,7 +327,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#if defined(CONFIG_BLK_RQ_ALLOC_TIME) || defined(CONFIG_CGROUP_SLI) rq->alloc_time_ns = alloc_time_ns; #endif if (blk_mq_need_time_stamp(rq)) @@ -368,7 +369,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = ktime_get_ns(); - +#ifdef CONFIG_CGROUP_SLI + else if (static_branch_unlikely(&sli_io_enabled)) + alloc_time_ns = ktime_get_ns(); +#endif data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index be90c3cf67d0f8aed02237e149f7b5f882b7a7b1..f89b93f62fd14a80d7816173de028a73c406b93b 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -65,6 +65,61 @@ static int seq_open_net(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_RQM +/* taken from seq_open_net; everything is the same except that the private + * data is allocated with vmalloc. Why? + * + * someone may need a big private area, and wasting contiguous physical + * memory for it is undesirable; they can use this function to get a + * vmalloc'd private instead + * + * from now on, if you use this open ABI please provide a write fops + * like proc_simple_write, since we removed the pde->write check + */ +void *seq_open_net_large_private(struct inode *inode, struct file *file) +{ + struct net *net; + struct seq_file *seq; + struct seq_net_private *p; + int ret; + unsigned int state_size = PDE(inode)->state_size; + + WARN_ON_ONCE(state_size < sizeof(struct seq_net_private)); + + net = get_proc_net(inode); + if (!net) { + ret = -ENXIO; + goto out; + } + + p = vmalloc(state_size); + if (!p) { + ret = -ENOMEM; + goto put_out; + } + memset(p, 0, state_size); + + ret = seq_open(file, PDE(inode)->seq_ops); + if (ret < 0) + goto free_out; + + seq = file->private_data; + seq->private = (void *)p; + +#ifdef CONFIG_NET_NS + p->net = net; +#endif + return p; + +free_out: + vfree(p); +put_out: + put_net(net); +out: + return ERR_PTR(ret); +} +EXPORT_SYMBOL(seq_open_net_large_private); +#endif + static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; @@ -118,6 +173,30 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(proc_create_net_data); +#ifdef CONFIG_RQM +/* add an extended ABI that lets callers define the fops themselves; this is + * just like proc_create_net_data except that it takes an extra f_ops parameter + */ +struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *f_ops) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + + pde_force_lookup(p); + p->proc_fops = f_ops; + p->seq_ops = seq_ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data_ops); +#endif + /** * proc_create_net_data_write - Create a writable net_ns-specific proc file * @name: The name of the file.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 875f5433b6369a26f5a11c79b702dc735c49c85b..24b101a246e962c00408d67fdc0fa99182233204 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -203,7 +203,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#if defined(CONFIG_BLK_RQ_ALLOC_TIME) || defined(CONFIG_CGROUP_SLI) /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; #endif diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ca005a23ce3ec2fa4bf0551243f7165d7f40602c..ca6fde2c3cfdf4d7ceb0eaa41d01c9a5830582e3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -522,6 +522,9 @@ struct cgroup { /* sched latency stat */ struct sli_schedlat_stat __percpu *sli_schedlat_stat_percpu; + /* io latency stat */ + struct sli_iolat_stat __percpu *sli_iolat_stat_percpu; + /* proactive event monitoring structure for cgroup */ struct sli_event_monitor *cgrp_event_monitor; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4b3345b37c0452f71be2a4eeb7dbcfb6ac4a46fe..fea0909b2e443ef9362d521f61c5d3bb9ed8e01e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -1009,12 +1009,13 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} ssize_t cgroup_priority(struct cgroup_subsys_state *css); struct cgroup *get_cgroup_from_task(struct task_struct *task); -ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...); ssize_t mbuf_print_task(struct task_struct *task, const char *fmt, ...); void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos); void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos); void cgroup_mbuf_stop(struct seq_file *s, void *v); int cgroup_mbuf_show(struct seq_file *s, void *v); +int cgroup_mbuf_open(struct kernfs_open_file *of); +void cgroup_mbuf_release(struct kernfs_open_file *of); int cgroup_sli_monitor_open(struct kernfs_open_file *of); void *cgroup_sli_monitor_start(struct seq_file *s, loff_t *pos); diff --git a/include/linux/mbuf.h b/include/linux/mbuf.h index 32779eee2ef569bdd7fc8fd8ffa58e402bb5ddad..7f3705923f86e981bb14db98c789db000b1f941d 100644 --- a/include/linux/mbuf.h +++ b/include/linux/mbuf.h @@ -1,13 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2021 bauerchen + * Copyright (C) 2024 mengensun */ #ifndef _CGROUP_MBUF_H #define _CGROUP_MBUF_H -#include -#include +#include struct mbuf_struct { u32 mbuf_len; @@ -47,28 +46,37 @@ struct mbuf_user_desc { /* each cgroup has a mbuf_slot struct */ struct mbuf_slot { u32 idx; - /* write op must hold this lock */ - spinlock_t slot_lock; + /* snapshot/write op must hold this lock */ + seqlock_t slot_lock; /* rate limit */ struct ratelimit_state ratelimit; - struct cgroup *owner; + void *owner; const struct mbuf_operations *ops; struct mbuf_ring *mring; - struct mbuf_user_desc *udesc; }; struct mbuf_operations { /* read message */ - ssize_t (*read) (struct mbuf_slot *, struct mbuf_user_desc *); + ssize_t (*read)(struct mbuf_slot *_slot, struct mbuf_user_desc *udest); + /* get next available idx */ - u32 (*next) (struct mbuf_ring *, u32); + u32 (*next)(struct mbuf_ring *mring, u32 idx); + /* write message */ - ssize_t (*write) (struct cgroup *, const char *, va_list); + ssize_t (*write)(struct mbuf_slot *mbuf, const char *fmt, va_list args); } ____cacheline_aligned; void __init mbuf_bmap_init(void); void __init setup_mbuf(void); + struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg); +struct mbuf_slot 
*mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops); void mbuf_free(struct cgroup *cg); + +ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...); +void snapshot_mbuf(struct mbuf_slot *, struct mbuf_slot*, seqlock_t *); +u32 get_mbuf_slot_len(void); +void mbuf_free_slot(struct mbuf_slot *slot); +void mbuf_reset(struct mbuf_slot *mbuf); #endif diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d22c1c7fa774b9c35c8aeafcab687d47cfb44445..8b9cb125a10ed5e7f8ecb68b415edef65ef44c19 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -81,6 +81,15 @@ extern struct pid *tgid_pidfd_to_pid(const struct file *file); extern struct net init_net; extern struct list_head sysctl_restrict_list; +#ifdef CONFIG_NETNS_MBUF +void *seq_open_net_large_private(struct inode *inode, struct file *file); +struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *proc_ops); +#endif + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must diff --git a/include/linux/sli.h b/include/linux/sli.h index 32c901b87e9b831b1c23ddfc19f1ce9686f0f2b7..963864f366e528099faadb64924092b23fae2df4 100755 --- a/include/linux/sli.h +++ b/include/linux/sli.h @@ -37,6 +37,11 @@ enum sli_schedlat_stat_item { SCHEDLAT_STAT_NR }; +enum sli_iolat_stat_item { + IO_LAT_DELAY, + IO_LAT_STAT_NR +}; + struct sli_memlat_stat { unsigned long latency_max[MEM_LAT_STAT_NR]; unsigned long item[MEM_LAT_STAT_NR][LAT_COUNT_NR]; @@ -47,10 +52,16 @@ struct sli_schedlat_stat { unsigned long item[SCHEDLAT_STAT_NR][LAT_COUNT_NR]; }; +struct sli_iolat_stat { + unsigned long latency_max[IO_LAT_STAT_NR]; + unsigned long item[IO_LAT_STAT_NR][LAT_COUNT_NR]; +}; + enum sli_event_type { SLI_SCHED_EVENT, SLI_MEM_EVENT, SLI_LONGTERM_EVENT, + SLI_IO_EVENT, SLI_EVENT_NR }; @@ -100,6 +111,10 @@ struct sli_event_monitor { unsigned long long longterm_threshold[SLI_LONGTERM_NR]; atomic_long_t longterm_statistics[SLI_LONGTERM_NR]; + unsigned long long iolat_threshold[IO_LAT_STAT_NR]; + unsigned long long iolat_count[IO_LAT_STAT_NR]; + atomic_long_t iolat_statistics[IO_LAT_STAT_NR]; + KABI_RESERVE(1); KABI_RESERVE(2); }; @@ -129,9 +144,17 @@ void sli_schedlat_stat(struct task_struct *task,enum sli_schedlat_stat_item sidx void sli_schedlat_rundelay(struct task_struct *task, struct task_struct *prev, u64 delta); int sli_schedlat_stat_show(struct seq_file *m, struct cgroup *cgrp); int sli_schedlat_max_show(struct seq_file *m, struct cgroup *cgrp); +void sli_iolat_stat_end(enum sli_iolat_stat_item sidx, u64 bio_start, u64 rq_alloc_time_ns, + u64 rq_io_start_time_ns, u64 sli_iolat_end_time, u64 duration, struct bio *bio, + struct cgroup *cgrp); +int sli_iolat_max_show(struct seq_file *m, struct cgroup *cgrp); +int sli_iolat_stat_show(struct seq_file *m, struct cgroup *cgrp); ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cgroup_sli_control_show(struct seq_file *sf, void *v); +int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v); +int mem_cgroup_sli_control_show(struct seq_file *sf, void *v); +int io_cgroup_sli_control_show(struct seq_file *sf, void *v); void sli_check_longsys(struct task_struct *tsk); void sli_update_tick(struct task_struct *tsk); @@ -145,5 +168,6 @@ void sli_monitor_stop(struct seq_file *seq, void *v); __poll_t sli_monitor_poll(struct 
kernfs_open_file *of, poll_table *pt); int sli_event_add(struct sli_notify_event *notify_event, u32 event_type, u32 levent, u32 count); u32 sli_monitor_signal(struct cgroup *cgrp, struct sli_notify_event *notify_event); +DECLARE_STATIC_KEY_FALSE(sli_io_enabled); #endif /*_LINUX_SLI_H*/ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 539972f4325cdc73e03b97dc7db47c1552700ba6..de8123fea2330a0345162daceb8286eb61f2b201 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -38,6 +38,9 @@ #include #include #include +#ifdef CONFIG_NETNS_MBUF +#include +#endif struct user_namespace; struct proc_dir_entry; @@ -190,6 +193,9 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; +#ifdef CONFIG_NETNS_MBUF + struct net_mbuf mbuf; +#endif } __randomize_layout; #include diff --git a/include/net/netns_mbuf.h b/include/net/netns_mbuf.h new file mode 100644 index 0000000000000000000000000000000000000000..6a272949c4a43a3df50bdafa9823a90ace330676 --- /dev/null +++ b/include/net/netns_mbuf.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * make mbuf can be used by net namespace + * + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc + */ +#ifndef __NETNS_MBUF +#define __NETNS_MBUF + +#include +#include + +#ifdef CONFIG_NETNS_MBUF +struct net_mbuf { + struct proc_dir_entry *twatcher; + struct proc_dir_entry *log; + struct mbuf_slot *slot; +}; + +int inet_mbuf_init(void); +void inet_mbuf_exit(void); +ssize_t net_mbuf_print(struct net *net, const char *fmt, ...); +#else +static __always_inline int inet_mbuf_init(void) {return 0; } +static __always_inline void inet_mbuf_exit(void) {} +static __always_inline ssize_t net_mbuf_print(struct net *net, const char *fmt, ...) {return 0; }; +#endif +#endif diff --git a/include/net/tcp.h b/include/net/tcp.h index d7660fda7bd436c144a6c4147259e74bdd0cc291..63cbb6af9d2dc45e8afa87181cf370493c1cf27f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -879,6 +879,7 @@ struct tcp_skb_cb { has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ + __u32 first_xmit_time; union { struct { /* There is space for up to 24 bytes */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f925f708e248b6f77baeb9c34bca53d197961027..9ec853feba300370e536d3b85332a40fce247ba2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3937,6 +3937,13 @@ static int cgroup_net_quality_show(struct seq_file *seq, void *v) #endif #ifdef CONFIG_CGROUP_SLI +static int cgroup_sli_io_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgroup = seq_css(seq)->cgroup; + + return sli_iolat_stat_show(seq, cgroup); +} + static int cgroup_sli_memory_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; @@ -3957,17 +3964,21 @@ static int cgroup_sli_max_show(struct seq_file *seq, void *v) struct cgroup *cgroup = seq_css(seq)->cgroup; sli_schedlat_max_show(seq, cgroup); - return sli_memlat_max_show(seq, cgroup); + sli_memlat_max_show(seq, cgroup); + return sli_iolat_max_show(seq, cgroup); } #endif void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos) { - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; u32 index; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)ctx->psi.trigger; - if (!mb) + /* why: see cgroup_mbuf_open */ + if 
(!mb->mring) return NULL; index = *pos; @@ -3975,29 +3986,26 @@ void *cgroup_mbuf_start(struct seq_file *s, loff_t *pos) if (index && index == mb->mring->next_idx) return NULL; - if (!mb->udesc) { - mb->udesc = kmalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); - - if (!mb->udesc) - goto out; - - mb->udesc->user_idx = mb->mring->first_idx; - mb->udesc->user_seq = mb->mring->first_seq; - } + udesc->user_idx = mb->mring->first_idx; + udesc->user_seq = mb->mring->first_seq; /* Maybe reach end or empty */ - if (mb->udesc->user_idx == mb->mring->next_idx) + if (udesc->user_idx == mb->mring->next_idx) return NULL; -out: - return mb->udesc; + return udesc; } void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos) { struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + + /* why: see cgroup_mbuf_open */ + if (!mb->mring) + return NULL; udesc->user_idx = mb->ops->next(mb->mring, udesc->user_idx); *pos = udesc->user_idx; @@ -4008,26 +4016,19 @@ void *cgroup_mbuf_next(struct seq_file *s, void *v, loff_t *pos) return udesc; } -void cgroup_mbuf_stop(struct seq_file *s, void *v) -{ - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_user_desc *desc; - - if (cgrp->mbuf) { - desc = cgrp->mbuf->udesc; - if(desc && desc->user_idx == cgrp->mbuf->mring->next_idx) { - kfree(cgrp->mbuf->udesc); - cgrp->mbuf->udesc = NULL; - } - } -} +void cgroup_mbuf_stop(struct seq_file *s, void *v) { } int cgroup_mbuf_show(struct seq_file *s, void *v) { ssize_t ret; struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct mbuf_slot *mb = cgrp->mbuf; + struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = (struct mbuf_slot *)ctx->procs1.pidlist; + + /* why: see cgroup_mbuf_open */ + if (!mb->mring) + return 0; memset(udesc->buf, 0, sizeof(udesc->buf)); ret = mb->ops->read(mb, udesc); @@ -4038,6 +4039,56 @@ int cgroup_mbuf_show(struct seq_file *s, void *v) return 0; } +extern u32 get_mbuf_slot_len(void); +int cgroup_mbuf_open(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + struct mbuf_slot *mb = seq_css(of->seq_file)->cgroup->mbuf; + u32 mbuf_slot_len; + + /* use ctx->psi.trigger for mbuf_user_desc */ + ctx->psi.trigger = kzalloc(sizeof(struct mbuf_user_desc), GFP_KERNEL); + if (!ctx->psi.trigger) + return -ENOMEM; + + mbuf_slot_len = get_mbuf_slot_len(); + /* use ctx->procs1.pidlist for mbuf_slot snapshot */ + ctx->procs1.pidlist = vmalloc(mbuf_slot_len); + if (!ctx->procs1.pidlist) { + kfree(ctx->psi.trigger); + ctx->psi.trigger = NULL; + return -ENOMEM; + } + memset(ctx->procs1.pidlist, 0, mbuf_slot_len); + + /* cgroup may have no mbuf attached, because the mbuf pool + * has a max num + * here we let file open success, so, seq_ops must + * check mring point + */ + if (!mb) + return 0; + + snapshot_mbuf((struct mbuf_slot *)ctx->procs1.pidlist, mb, &mb->slot_lock); + + return 0; +} + +void cgroup_mbuf_release(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->psi.trigger) { + kfree(ctx->psi.trigger); + ctx->psi.trigger = NULL; + } + + if (ctx->procs1.pidlist) { + vfree(ctx->procs1.pidlist); + ctx->procs1.pidlist = NULL; + } +} + /* * Get cgroup struct from task_struct for mbuf and sli. 
* @@ -4082,7 +4133,7 @@ ssize_t mbuf_print_task(struct task_struct *task, const char *fmt, ...) if (mb->ops) { va_start(args, fmt); - mb->ops->write(cgrp, fmt, args); + mb->ops->write(mb, fmt, args); va_end(args); } @@ -4106,7 +4157,7 @@ ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...) if (mb->ops) { va_start(args, fmt); - mb->ops->write(cgrp, fmt, args); + mb->ops->write(mb, fmt, args); va_end(args); } @@ -5645,16 +5696,23 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_net_quality_show, .release = cgroup_net_release, }, -#endif { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_CGROUP_SLI + { + .name = "sli.io", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_sli_io_show, + }, { .name = "sli.memory", .flags = CFTYPE_NOT_ON_ROOT, @@ -6141,6 +6199,11 @@ static inline bool cgroup_need_mbuf(struct cgroup *cgrp) return true; #endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) + if (cgroup_css(cgrp, cgroup_subsys[io_cgrp_id])) + return true; +#endif + return false; } @@ -6399,10 +6462,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (cgrp->mbuf) mbuf_free(cgrp); -#ifdef CONFIG_CGROUP_SLI - if (cgrp->sctx) - sctx_free(cgrp); -#endif /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); diff --git a/kernel/cgroup/mbuf.c b/kernel/cgroup/mbuf.c index 6d7cb88273aa04e2ffeab1a383099c07a934e619..78f81312d455725e24f43bfa30310c9fd2cece40 100644 --- a/kernel/cgroup/mbuf.c +++ b/kernel/cgroup/mbuf.c @@ -1,11 +1,11 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: GPL-2.0-only /* * Quality Monitor Buffer * Aim to provide backup buffer for RQM to record critical message. * Could be used to catch critical context when abnormal jitters occur. 
* - * Author: bauerchen - * Copyright (C) 2021 Tencent, Inc + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc */ #include @@ -21,14 +21,20 @@ /* Define max mbuf len is 8M, and min is 2M */ #define MBUF_LEN_MAX (1 << 23) #define MBUF_LEN_MIN (1 << 21) -#define MBUF_LEN_DEF MBUF_LEN_MIN +/* + * from now, every netns has a mbuf, because + * change the mbuf slot size is dangerous, so + * double the total buffer size to double + * total mbuf slot num (see MBUF_SLOTS_DEF) + */ +#define MBUF_LEN_DEF (1 << 22) #define MBUF_MSG_LEN_MAX 1024 /* Monitor buffer support max 1024 items */ #define MBUF_SLOTS_MAX 1024 #define MBUF_SLOTS_MIN 256 -#define MBUF_SLOTS_DEF 512 +#define MBUF_SLOTS_DEF 1024 /* Global mbuf metadata struct */ static struct mbuf_struct g_mbuf = { @@ -51,7 +57,7 @@ static void __init mbuf_len_update(u64 size) (u64)MBUF_LEN_MAX); } - if (size < MBUF_LEN_MIN){ + if (size < MBUF_LEN_MIN) { size = (u64) MBUF_LEN_MIN; pr_warn("mbuf: monitor buffer less [ %llu ] is not supported.\n", (u64) MBUF_LEN_MIN); @@ -112,7 +118,7 @@ void __init mbuf_bmap_init(void) L1_CACHE_BYTES); mbuf_bitmap = kmalloc(alloc_size, __GFP_HIGH|__GFP_ZERO); - if(!mbuf_bitmap){ + if (!mbuf_bitmap) { pr_err("mbuf: alloc mbuf_bitmap failed!\n"); return; } @@ -159,7 +165,7 @@ static u32 mbuf_next(struct mbuf_ring *mring, u32 curr_idx) * just goto head */ frees = mring->end_idx - next_idx; - if(frees < sizeof(struct mbuf_ring_desc)){ + if (frees < sizeof(struct mbuf_ring_desc)) { next_idx = mring->base_idx; goto next; } @@ -224,9 +230,8 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size) { u32 frees; - if (unlikely(msg_size > MBUF_MSG_LEN_MAX)) { + if (unlikely(msg_size > MBUF_MSG_LEN_MAX)) return -ENOMEM; - } while (mring->first_seq < mring->next_seq) { @@ -247,26 +252,26 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size) } /* Write monitor buffer message */ -static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) +static ssize_t do_mbuf_write(struct mbuf_slot *mbuf, char *buffer, size_t size) { struct mbuf_ring *mring; struct mbuf_ring_desc *desc; size_t len; unsigned long flags; - if (size >= g_mbuf.mbuf_size_per_cg){ + if (size >= g_mbuf.mbuf_size_per_cg) { pr_err("mbuf: write message need less than [ %u ] bytes\n", g_mbuf.mbuf_size_per_cg); return 0; } - mring = cg->mbuf->mring; + mring = mbuf->mring; len = sizeof(struct mbuf_ring_desc) + size; - spin_lock_irqsave(&cg->mbuf->slot_lock, flags); + write_seqlock_irqsave(&mbuf->slot_lock, flags); - if (mbuf_prepare(mring, len)){ - spin_unlock_irqrestore(&cg->mbuf->slot_lock, flags); + if (mbuf_prepare(mring, len)) { + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); pr_err("mbuf: Can not find enough space.\n"); return 0; } @@ -285,20 +290,25 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) mring->next_idx += desc->len; mring->next_seq++; - spin_unlock_irqrestore(&cg->mbuf->slot_lock, flags); + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); return size; } -void mbuf_reset(struct mbuf_ring *mring) +void mbuf_reset(struct mbuf_slot *mbuf) { - mring->first_idx = mring->base_idx; - mring->first_seq = 0; - mring->next_idx = mring->base_idx; - mring->next_seq = 0; + unsigned long flags; + + write_seqlock_irqsave(&mbuf->slot_lock, flags); + mbuf->mring->first_idx = mbuf->mring->base_idx; + mbuf->mring->first_seq = 0; + mbuf->mring->next_idx = mbuf->mring->base_idx; + mbuf->mring->next_seq = 0; + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); } +EXPORT_SYMBOL(mbuf_reset); -static 
ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) +static ssize_t mbuf_write(struct mbuf_slot *mbuf, const char *fmt, va_list args) { static char buf[MBUF_MSG_LEN_MAX]; char *text = buf; @@ -308,7 +318,7 @@ static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) t_len = vscnprintf(text, sizeof(buf), fmt, args); /* Write string to mbuf */ - ret = do_mbuf_write(cg, text, t_len); + ret = do_mbuf_write(mbuf, text, t_len); return ret; } @@ -330,23 +340,30 @@ static int get_next_mbuf_id(unsigned long *addr, u32 start) return index; } -static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index) +static void mbuf_slot_init(struct mbuf_slot *mb, + void *owner, u32 index, struct mbuf_operations *ops) { - mb->owner = cg; + mb->owner = owner; mb->idx = index; - mb->ops = &mbuf_ops; - spin_lock_init(&mb->slot_lock); - ratelimit_state_init(&mb->ratelimit, 5 * HZ,50); + + if (!ops) + mb->ops = &mbuf_ops; + else + mb->ops = ops; + + seqlock_init(&mb->slot_lock); + ratelimit_state_init(&mb->ratelimit, 5 * HZ, 50); mb->mring = (struct mbuf_ring *)((char *)mb + sizeof(struct mbuf_slot)); - mb->mring->base_idx = index * - g_mbuf.mbuf_size_per_cg + sizeof(struct mbuf_slot) + sizeof(struct mbuf_ring); + mb->mring->base_idx = index * g_mbuf.mbuf_size_per_cg + + sizeof(struct mbuf_slot) + + sizeof(struct mbuf_ring); mb->mring->end_idx = (index + 1) * g_mbuf.mbuf_size_per_cg - 1; - mbuf_reset(mb->mring); + mbuf_reset(mb); } -struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops) { struct mbuf_slot *mb; u32 index = 0; @@ -395,25 +412,158 @@ struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) g_mbuf.mbuf_next_id = index; mb = (struct mbuf_slot *)(g_mbuf.mbuf + index * g_mbuf.mbuf_size_per_cg); - mbuf_slot_init(mb, cg, index); + mbuf_slot_init(mb, owner, index, ops); g_mbuf.mbuf_frees--; spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); return mb; } +EXPORT_SYMBOL(mbuf_slot_alloc_v2); -void mbuf_free(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +{ + return mbuf_slot_alloc_v2((void *)cg, NULL); +} +EXPORT_SYMBOL(mbuf_slot_alloc); + +void mbuf_free_slot(struct mbuf_slot *slot) { unsigned long flags; spin_lock_irqsave(&g_mbuf.mbuf_lock, flags); - /* Make current idx the next available buffer */ - g_mbuf.mbuf_next_id = cg->mbuf->idx; + g_mbuf.mbuf_next_id = slot->idx; __clear_bit(g_mbuf.mbuf_next_id, g_mbuf.mbuf_bitmap); - g_mbuf.mbuf_frees++; spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); + +} +EXPORT_SYMBOL(mbuf_free_slot); + +void mbuf_free(struct cgroup *cg) +{ + unsigned long flags; + struct mbuf_slot *slot; + + spin_lock_irqsave(&cg->cgrp_mbuf_lock, flags); + slot = cg->mbuf; + cg->mbuf = NULL; + spin_unlock_irqrestore(&cg->cgrp_mbuf_lock, flags); + + mbuf_free_slot(slot); +} + +static u32 rd_mbuf_next(struct mbuf_ring *mring, u32 curr_idx) +{ + struct mbuf_ring_desc *cdesc, *ndesc; + u32 frees, next_idx; + void *start; + + start = (void *)(mring + 1); + cdesc = (struct mbuf_ring_desc *)(start + curr_idx); + next_idx = curr_idx + cdesc->len; + + frees = mring->end_idx - next_idx; + if (frees < sizeof(struct mbuf_ring_desc)) { + /* end */ + if (next_idx == mring->next_idx) + return next_idx; + + /*buffer wrapped to head */ + next_idx = mring->base_idx; + goto next; + } + + ndesc = (struct mbuf_ring_desc *)(start + next_idx); + + /* same magic can't be said */ + if (!ndesc->len && next_idx != mring->next_idx) + next_idx = mring->base_idx; +next: + 
return next_idx; } +static ssize_t rd_mbuf_read(struct mbuf_slot *mb, struct mbuf_user_desc *udesc) +{ + struct mbuf_ring_desc *desc; + ssize_t ret; + size_t i, len, tbuf_len; + char *start; + + tbuf_len = sizeof(udesc->buf); + start = (char *)(mb->mring + 1); + desc = (struct mbuf_ring_desc *)(start + udesc->user_idx); + + len = sprintf(udesc->buf, "%llu:", desc->ts_ns); + start = (char *)(desc + 1); + + for (i = 0; i < desc->text_len; i++) { + unsigned char c = start[i]; + + if (c < ' ' || c >= 127 || c == '\\') + continue; + else + udesc->buf[len++] = c; + if (len >= tbuf_len) + break; + } + + len = len >= tbuf_len ? tbuf_len - 1 : len; + udesc->buf[len] = '\n'; + udesc->user_seq++; + ret = len; + return ret; +} + +/* these ops are the read-side ABI of the mbuf; the mbuf has a write op + * which is protected by the slot seqlock, but there is no read/write-side + * protection beyond that. + * use it as follows: + * + * call snapshot_mbuf to copy data from the mbuf into `dst`, then read + * the dst with the following ops + * + * every index is an offset from the start of the snapshot's mring, + * instead of from the global mbuf memory pool + * + * btw: the private data of the seq file is the ideal place to hold the + * snapshot + */ +const struct mbuf_operations rd_mbuf_ops = { + .read = rd_mbuf_read, + .next = rd_mbuf_next, +}; + +void snapshot_mbuf(struct mbuf_slot *dst, struct mbuf_slot *src, seqlock_t *lock) +{ + unsigned int seq; + + do { + /* the peer of this lock is the write side; we want the writer + * to win when there is a conflict, and this reader retries + * until it gets a consistent snapshot of the buffer + */ + cond_resched(); + seq = read_seqbegin(lock); + memcpy((void *)dst, (void *)src, g_mbuf.mbuf_size_per_cg); + } while (read_seqretry(lock, seq)); + + /* all the ops in `rd_mbuf_ops` see an idx offset from the start + * of the mring.
so here adjust the idx as a whole + */ + dst->mring = (struct mbuf_ring *)(dst + 1); + dst->mring->end_idx = dst->mring->end_idx - dst->mring->base_idx; + dst->mring->first_idx = dst->mring->first_idx - dst->mring->base_idx; + dst->mring->next_idx = dst->mring->next_idx - dst->mring->base_idx; + dst->mring->base_idx = 0; + dst->ops = &rd_mbuf_ops; +} +EXPORT_SYMBOL(snapshot_mbuf); + +/* the mbuf size per cg is not changed once the system booted up */ +u32 get_mbuf_slot_len(void) +{ + return g_mbuf.mbuf_size_per_cg; +} +EXPORT_SYMBOL(get_mbuf_slot_len); diff --git a/kernel/cgroup/sli.c b/kernel/cgroup/sli.c index faa4756b73e52a87af9e71834e7184a3d408a204..71e9a58b6c0cc7894d515ed65ca9d0ab35965c8a 100755 --- a/kernel/cgroup/sli.c +++ b/kernel/cgroup/sli.c @@ -18,6 +18,7 @@ #define MAX_STACK_TRACE_DEPTH 64 static DEFINE_STATIC_KEY_FALSE(sli_enabled); +DEFINE_STATIC_KEY_FALSE(sli_io_enabled); static DEFINE_STATIC_KEY_FALSE(sli_monitor_enabled); static struct sli_event_monitor default_sli_event_monitor; @@ -59,12 +60,17 @@ static const char *longterm_threshold_name[] = { "longterm_irqtime_threshold=" }; +static const char *iolat_threshold_name[] = { + "iolat_delay_threshold=" +}; + static const char *sanity_check_abbr[] = { "schedlat_", "memlat_", "longterm_", "period=", - "mbuf_enable=" + "mbuf_enable=", + "iolat_" }; static void sli_proactive_monitor_work(struct work_struct *work); @@ -96,6 +102,8 @@ static void sli_event_monitor_init(struct sli_event_monitor *event_monitor, stru memset(&event_monitor->memlat_threshold, 0xff, sizeof(event_monitor->memlat_threshold)); memset(&event_monitor->memlat_count, 0xff, sizeof(event_monitor->memlat_count)); memset(&event_monitor->longterm_threshold, 0xff, sizeof(event_monitor->longterm_threshold)); + memset(&event_monitor->iolat_threshold, 0xff, sizeof(event_monitor->iolat_threshold)); + memset(&event_monitor->iolat_count, 0xff, sizeof(event_monitor->iolat_count)); event_monitor->last_update = jiffies; event_monitor->cgrp = cgrp; @@ -153,6 +161,12 @@ static int sli_event_inherit(struct cgroup *cgrp) &cgrp_event_monitor->longterm_statistics[new_event->event_id], sli_get_longterm_statistics(cgrp, new_event->event_id)); break; + case SLI_IO_EVENT: + cgrp_event_monitor->iolat_threshold[new_event->event_id] = + READ_ONCE(event_monitor->iolat_threshold[new_event->event_id]); + cgrp_event_monitor->iolat_count[new_event->event_id] = + READ_ONCE(event_monitor->iolat_count[new_event->event_id]); + break; default: printk(KERN_ERR "%s: invalid sli_event type!\n", __func__); goto failed; @@ -182,13 +196,12 @@ static int sli_event_inherit(struct cgroup *cgrp) } static void store_task_stack(struct task_struct *task, char *reason, - u64 duration, unsigned int skipnr) + u64 duration, unsigned int skipnr, struct cgroup *cgrp) { unsigned long *entries; unsigned nr_entries = 0; unsigned long flags; int i; - struct cgroup *cgrp; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_ATOMIC); @@ -197,7 +210,6 @@ static void store_task_stack(struct task_struct *task, char *reason, nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, skipnr); - cgrp = get_cgroup_from_task(task); spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); mbuf_print(cgrp, "record reason:%s comm:%s pid:%d duration=%lld\n", @@ -242,6 +254,21 @@ static char * get_memlat_name(enum sli_memlat_stat_item sidx) return name; } +static char *get_iolat_name(enum sli_iolat_stat_item sidx) +{ + char *name = NULL; + + switch (sidx) { + case IO_LAT_DELAY: + name = "iolat_delay"; + break; + 
default: + break; + } + + return name; +} + static enum sli_lat_count get_lat_count_idx(u64 duration) { enum sli_lat_count idx; @@ -428,7 +455,7 @@ void sli_memlat_stat_end(enum sli_memlat_stat_item sidx, u64 start) char *lat_name; lat_name = get_memlat_name(sidx); - store_task_stack(current, lat_name, duration, 0); + store_task_stack(current, lat_name, duration, 0, cgrp); } } } @@ -437,6 +464,106 @@ void sli_memlat_stat_end(enum sli_memlat_stat_item sidx, u64 start) rcu_read_unlock(); } +static u64 sli_iolat_stat_gather(struct cgroup *cgrp, + enum sli_iolat_stat_item sidx, + enum sli_lat_count cidx) +{ + u64 sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += per_cpu_ptr(cgrp->sli_iolat_stat_percpu, cpu)->item[sidx][cidx]; + + return sum; +} + +int sli_iolat_stat_show(struct seq_file *m, struct cgroup *cgrp) +{ + enum sli_iolat_stat_item sidx; + + if (!static_branch_likely(&sli_io_enabled)) { + seq_printf(m, "sli_io is not enabled, please echo 1 > /proc/sli/sli_io_enabled\n"); + return 0; + } + + if (!cgrp->sli_iolat_stat_percpu) + return 0; + + for (sidx = IO_LAT_DELAY; sidx < IO_LAT_STAT_NR; sidx++) { + seq_printf(m, "%s:\n", get_iolat_name(sidx)); + seq_printf(m, "0-1ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_0_1)); + seq_printf(m, "1-4ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_1_4)); + seq_printf(m, "4-8ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_4_8)); + seq_printf(m, "8-16ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_8_16)); + seq_printf(m, "16-32ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_16_32)); + seq_printf(m, "32-64ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_32_64)); + seq_printf(m, "64-128ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_64_128)); + seq_printf(m, ">=128ms: %llu\n", sli_iolat_stat_gather(cgrp, sidx, LAT_128_INF)); + } + + return 0; +} + +int sli_iolat_max_show(struct seq_file *m, struct cgroup *cgrp) +{ + enum sli_iolat_stat_item sidx; + + if (!static_branch_likely(&sli_io_enabled)) { + seq_printf(m, "sli_io is not enabled, please echo 1 > /proc/sli/sli_io_enabled\n"); + return 0; + } + + if (!cgrp->sli_iolat_stat_percpu) + return 0; + + for (sidx = IO_LAT_DELAY; sidx < IO_LAT_STAT_NR; sidx++) { + int cpu; + unsigned long latency_sum = 0; + + for_each_possible_cpu(cpu) + latency_sum += per_cpu_ptr(cgrp->sli_iolat_stat_percpu, cpu)->latency_max[sidx]; + + seq_printf(m, "%s: %lu\n", get_iolat_name(sidx), latency_sum); + } + + return 0; +} + +void sli_iolat_stat_end(enum sli_iolat_stat_item sidx, u64 bio_start, u64 rq_alloc_time_ns, + u64 rq_io_start_time_ns, u64 sli_iolat_end_time, u64 duration, struct bio *bio, + struct cgroup *cgrp) +{ + enum sli_lat_count cidx; + + cidx = get_lat_count_idx(duration); + duration = duration >> 10; + this_cpu_inc(cgrp->sli_iolat_stat_percpu->item[sidx][cidx]); + this_cpu_add(cgrp->sli_iolat_stat_percpu->latency_max[sidx], duration); + + if (static_branch_unlikely(&sli_monitor_enabled)) { + struct sli_event_monitor *event_monitor = cgrp->cgrp_event_monitor; + + if (duration < READ_ONCE(event_monitor->iolat_threshold[sidx])) + return; + + atomic_long_inc(&event_monitor->iolat_statistics[sidx]); + if (event_monitor->mbuf_enable) { + char *lat_name; + unsigned long flags; + char b[BDEVNAME_SIZE]; + + lat_name = get_iolat_name(sidx); + spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); + mbuf_print(cgrp, "record reason:%s devname:%s duration_us=%lld " + "bio_start=%llu req_start=%llu req_issue=%llu " + "bio_complete=%llu\n", lat_name, bio_devname(bio, b), + duration, bio_start, 
rq_alloc_time_ns, + rq_io_start_time_ns, sli_iolat_end_time); + spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags); + } + } +} + void sli_schedlat_stat(struct task_struct *task, enum sli_schedlat_stat_item sidx, u64 delta) { struct cgroup *cgrp = NULL; @@ -465,7 +592,7 @@ void sli_schedlat_stat(struct task_struct *task, enum sli_schedlat_stat_item sid char *lat_name; lat_name = get_schedlat_name(sidx); - store_task_stack(task, lat_name, delta, 0); + store_task_stack(task, lat_name, delta, 0, cgrp); } } } @@ -627,6 +754,20 @@ static void sli_proactive_monitor_work(struct work_struct *work) sli_event_add(notify_event, event->event_type, event->event_id, (int)(statistics - last_statistics)); break; + case SLI_IO_EVENT: + statistics = (u64)atomic_long_read( + &event_monitor->iolat_statistics[event->event_id]); + atomic_long_set(&event_monitor->iolat_statistics[event->event_id], 0); + + if (event_monitor->overrun) { + event_monitor->overrun = 0; + break; + } + + if (statistics >= READ_ONCE(event_monitor->iolat_count[event->event_id])) + sli_event_add(notify_event, event->event_type, + event->event_id, statistics); + break; default: break; } @@ -640,64 +781,98 @@ static void sli_proactive_monitor_work(struct work_struct *work) css_put(&event_monitor->cgrp->self); } +struct cgroup *get_cgroup_from_task_id(struct task_struct *task, int event_nr) +{ + int id; + struct cgroup *cgrp; + + id = cpuacct_cgrp_id; + switch (event_nr) { +#if IS_ENABLED(CONFIG_MEMCG) + case SLI_MEM_EVENT: + id = memory_cgrp_id; + break; +#endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) + case SLI_IO_EVENT: + id = io_cgrp_id; + break; +#endif + default: + break; + } + + /* First, try to get cpuacct/mem cgroup for V1*/ + cgrp = task_cgroup(task, id); + if (cgrp && cgrp->level) + return cgrp; + + /* + * If can not find cpuacct/mem cgroup or cpuacct/mem cgroup is root, just return + * dfl_cgrp. + */ + cgrp = task_dfl_cgroup(task); + + return cgrp; +} + void sli_update_tick(struct task_struct *tsk) { struct cgroup *cgrp; + int i; if (!static_branch_likely(&sli_monitor_enabled)) return; rcu_read_lock(); - cgrp = get_cgroup_from_task(tsk); - if (cgrp && cgroup_parent(cgrp)) { - bool ret; - int period; - unsigned long long old_value, last_update; + for (i = 0; i < SLI_EVENT_NR; i++) { + cgrp = get_cgroup_from_task_id(tsk, i); - period = cgrp->cgrp_event_monitor->period; - if (!period) - goto unlock; + if (cgrp && cgroup_parent(cgrp)) { + bool ret; + int period; + unsigned long long old_value, last_update; -retry: - last_update = READ_ONCE(cgrp->cgrp_event_monitor->last_update); - if (time_after((unsigned long)(period + last_update), jiffies)) - goto unlock; + period = cgrp->cgrp_event_monitor->period; + if (!period) + continue; - old_value = cmpxchg(&cgrp->cgrp_event_monitor->last_update, - last_update, jiffies); - if (old_value != last_update) - goto retry; +retry: + last_update = READ_ONCE(cgrp->cgrp_event_monitor->last_update); + if (time_after((unsigned long)(period + last_update), jiffies)) + continue; - /* - * Current jiffies should be somewhere between period and 8 * period, - * otherwise we consider the it is overrun and should be abandoned. 
- */ - if (time_before((unsigned long)((period << 3) + last_update), jiffies)) - cgrp->cgrp_event_monitor->overrun = 1; + old_value = cmpxchg(&cgrp->cgrp_event_monitor->last_update, + last_update, jiffies); + if (old_value != last_update) + goto retry; - rcu_read_unlock(); + /* + * Current jiffies should be somewhere between period and 8 * period, + * otherwise we consider the it is overrun and should be abandoned. + */ + if (time_before((unsigned long)((period << 3) + last_update), jiffies)) + cgrp->cgrp_event_monitor->overrun = 1; - ret = css_tryget(&cgrp->self); - if (!ret) - return; + ret = css_tryget(&cgrp->self); + if (!ret) + continue; - /* - * The sli trace work may have a lot a work to do, and should send - * the event to polling tasks. So we don't do the work in interrupt - * context(put the work to the workqueue). - */ - ret = queue_work(sli_workqueue, &cgrp->cgrp_event_monitor->sli_event_work); - /* - * If work had been pushed to workqueue and not been executed, there is no - * need to push it again. So we must put the css refcount. - */ - if (!ret) - css_put(&cgrp->self); - return; + /* + * The sli trace work may have a lot a work to do, and should send + * the event to polling tasks. So we don't do the work in interrupt + * context(put the work to the workqueue). + */ + ret = queue_work(sli_workqueue, &cgrp->cgrp_event_monitor->sli_event_work); + /* + * If work had been pushed to workqueue and not been executed, there is no + * need to push it again. So we must put the css refcount. + */ + if (!ret) + css_put(&cgrp->self); + } } - -unlock: rcu_read_unlock(); } @@ -780,7 +955,8 @@ static unsigned long sli_get_longterm_statistics(struct cgroup *cgrp, return latency_sum; } -static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec) +static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec, + int index) { char *str; int i, len, ret; @@ -788,15 +964,21 @@ static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec) /* Replace the delimiter with '\0' */ len = strlen(buf); - for (i = 0; i < len; i++) { - if (buf[i] == ',' || buf[i] == ' ') { - buf[i] = '\0'; - break; + if (len == 0) + return -EINVAL; + + /* longterm_rundelay/irqtime dont need check */ + if (index != 2) { + for (i = 0; i < len; i++) { + if (buf[i] == ',' || buf[i] == ' ') { + buf[i] = '\0'; + break; + } } - } - if (i == len) - return -EINVAL; + if (i == len) + return -EINVAL; + } /* Parse the value for theshold */ ret = kstrtou64(buf, 0, &value); @@ -805,6 +987,10 @@ static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec) sec->threshold = sli_convert_value(value, false); + /* longterm_rundelay/irqtime dont need count= param*/ + if (index == 2) + return 0; + /* Move the pointer to the positon which after the delimiter */ buf += (i + 1); len -= (i + 1); @@ -842,7 +1028,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec return -EINVAL; buf += min_len; - ret = sli_parse_threshold(buf, sec); + ret = sli_parse_threshold(buf, sec, index); if (ret) return ret; @@ -860,7 +1046,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec return -EINVAL; buf += min_len; - ret = sli_parse_threshold(buf, sec); + ret = sli_parse_threshold(buf, sec, index); if (ret) return ret; @@ -878,7 +1064,7 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec return -EINVAL; buf += min_len; - ret = sli_parse_threshold(buf, sec); + ret = sli_parse_threshold(buf, sec, 
index); if (ret) return ret; @@ -906,7 +1092,24 @@ static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec sec->mbuf_enable = !!value; break; + case 5: + for (i = 0; i < ARRAY_SIZE(iolat_threshold_name); i++) { + min_len = min(len, (int)strlen((const char *)iolat_threshold_name[i])); + if (!strncmp(iolat_threshold_name[i], buf, min_len)) + break; + } + + if (i == ARRAY_SIZE(iolat_threshold_name)) + return -EINVAL; + + buf += min_len; + ret = sli_parse_threshold(buf, sec, index); + if (ret) + return ret; + sec->event_type = SLI_IO_EVENT; + sec->event_id = i; + break; default: return -EINVAL; } @@ -1054,6 +1257,14 @@ ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, sli_get_longterm_statistics(cgrp, sec.event_id)); ret = sli_event_update(event_monitor, &sec, last_threshold); break; + case SLI_IO_EVENT: + last_threshold = event_monitor->iolat_threshold[sec.event_id]; + WRITE_ONCE(event_monitor->iolat_threshold[sec.event_id], sec.threshold); + WRITE_ONCE(event_monitor->iolat_count[sec.event_id], sec.count); + smp_wmb(); + atomic_long_set(&event_monitor->iolat_statistics[sec.event_id], 0); + ret = sli_event_update(event_monitor, &sec, last_threshold); + break; default: break; } @@ -1067,6 +1278,115 @@ ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf, return ret; } +int io_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + inode_lock_shared(file_inode(sf->file)); + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); + + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } + + inode_unlock_shared(file_inode(sf->file)); + return 0; +} + +int mem_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + inode_lock_shared(file_inode(sf->file)); + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); + + for (i = 0; i < MEM_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->memlat_threshold[i], true); + count = sli_convert_value(event_monitor->memlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i), + threshold, count); + } + + inode_unlock_shared(file_inode(sf->file)); + return 0; +} + +int cpuacct_cgroup_sli_control_show(struct seq_file *sf, void *v) +{ + int i; + unsigned long long threshold, count; + struct cgroup *cgrp; + struct sli_event_monitor *event_monitor; + + cgrp = seq_css(sf)->cgroup; + if (cgroup_parent(cgrp)) + event_monitor = cgrp->cgrp_event_monitor; + else + event_monitor = &default_sli_event_monitor; + + inode_lock_shared(file_inode(sf->file)); + seq_printf(sf, "period: %d\n", event_monitor->period); + seq_printf(sf, "mbuf_enable: %d\n", 
event_monitor->mbuf_enable); + + for (i = 0; i < SCHEDLAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->schedlat_threshold[i], true); + count = sli_convert_value(event_monitor->schedlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_schedlat_name(i), + threshold, count); + } + + for (i = 0; i < SLI_LONGTERM_NR; i++) { + threshold = sli_convert_value(event_monitor->longterm_threshold[i], true); + + seq_printf(sf, "%s: threshold: %llu\n", get_longterm_name(i), threshold); + } + + if (!cgroup_parent(cgrp)) { + for (i = 0; i < MEM_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->memlat_threshold[i], true); + count = sli_convert_value(event_monitor->memlat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i), + threshold, count); + } + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } + } + + inode_unlock_shared(file_inode(sf->file)); + return 0; +} int cgroup_sli_control_show(struct seq_file *sf, void *v) { int i; @@ -1080,6 +1400,7 @@ int cgroup_sli_control_show(struct seq_file *sf, void *v) else event_monitor = &default_sli_event_monitor; + inode_lock_shared(file_inode(sf->file)); seq_printf(sf, "period: %d\n", event_monitor->period); seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable); @@ -1105,6 +1426,14 @@ int cgroup_sli_control_show(struct seq_file *sf, void *v) seq_printf(sf, "%s: threshold: %llu\n", get_longterm_name(i), threshold); } + for (i = 0; i < IO_LAT_STAT_NR; i++) { + threshold = sli_convert_value(event_monitor->iolat_threshold[i], true); + count = sli_convert_value(event_monitor->iolat_count[i], true); + + seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_iolat_name(i), + threshold, count); + } + inode_unlock_shared(file_inode(sf->file)); return 0; } @@ -1330,6 +1659,7 @@ static ssize_t sli_enabled_write(struct file *file, const char __user *ubuf, goto out; } + inode_lock(file_inode(file)); switch (val) { case '0': if (static_key_enabled(&sli_enabled)) @@ -1342,6 +1672,7 @@ static ssize_t sli_enabled_write(struct file *file, const char __user *ubuf, default: ret = -EINVAL; } + inode_unlock(file_inode(file)); out: return ret; @@ -1355,15 +1686,73 @@ static const struct file_operations sli_enabled_fops = { .release = single_release, }; +static int sli_io_enabled_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", static_key_enabled(&sli_io_enabled)); + return 0; +} + +static int sli_io_enabled_open(struct inode *inode, struct file *file) +{ + return single_open(file, sli_io_enabled_show, NULL); +} + +static ssize_t sli_io_enabled_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + char val = -1; + int ret = count; + + if (count < 1 || *ppos) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&val, ubuf, 1)) { + ret = -EFAULT; + goto out; + } + + inode_lock(file_inode(file)); + switch (val) { + case '0': + if (static_key_enabled(&sli_io_enabled)) + static_branch_disable(&sli_io_enabled); + break; + case '1': + if (!static_key_enabled(&sli_io_enabled)) + static_branch_enable(&sli_io_enabled); + break; + default: + ret = -EINVAL; + } + inode_unlock(file_inode(file)); + +out: + return ret; +} + +static const struct file_operations sli_io_enabled_fops = { + .open = sli_io_enabled_open, 
+ .read = seq_read, + .write = sli_io_enabled_write, + .llseek = seq_lseek, + .release = single_release, +}; + int sli_cgroup_alloc(struct cgroup *cgroup) { if (!cgroup_need_sli(cgroup)) return 0; spin_lock_init(&cgroup->cgrp_mbuf_lock); + cgroup->sli_iolat_stat_percpu = alloc_percpu(struct sli_iolat_stat); + if (!cgroup->sli_iolat_stat_percpu) + goto out; + cgroup->sli_memlat_stat_percpu = alloc_percpu(struct sli_memlat_stat); if (!cgroup->sli_memlat_stat_percpu) - goto out; + goto free_iolat_percpu; cgroup->sli_schedlat_stat_percpu = alloc_percpu(struct sli_schedlat_stat); if (!cgroup->sli_schedlat_stat_percpu) @@ -1385,6 +1774,8 @@ int sli_cgroup_alloc(struct cgroup *cgroup) free_percpu(cgroup->sli_schedlat_stat_percpu); free_memlat_percpu: free_percpu(cgroup->sli_memlat_stat_percpu); +free_iolat_percpu: + free_percpu(cgroup->sli_iolat_stat_percpu); out: return -ENOMEM; } @@ -1393,6 +1784,8 @@ void sli_cgroup_free(struct cgroup *cgroup) { struct sli_event *event, *event_tmp; + if (cgroup->sctx) + sctx_free(cgroup); /* * Cgroup's subsys would be cleared before sli_cgroup_free() had been called. * So we use !cgroup->cgrp_event_monitor instead of cgroup_need_sli to check @@ -1401,6 +1794,7 @@ void sli_cgroup_free(struct cgroup *cgroup) if (!cgroup->cgrp_event_monitor) return; + free_percpu(cgroup->sli_iolat_stat_percpu); free_percpu(cgroup->sli_memlat_stat_percpu); free_percpu(cgroup->sli_schedlat_stat_percpu); /* Free memory from the event list */ @@ -1422,6 +1816,7 @@ static int __init sli_proc_init(void) } proc_mkdir("sli", NULL); proc_create("sli/sli_enabled", 0, NULL, &sli_enabled_fops); + proc_create("sli/sli_io_enabled", 0, NULL, &sli_io_enabled_fops); return 0; } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0259cfdeff8379e76e406c79371232dca7bf343f..4f49ec1eb14fa3011361ac50e87972cfe6424642 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -411,14 +411,18 @@ static struct cftype files[] = { .name = "uptime", .seq_show = cpuacct_uptime_show, }, +#ifdef CONFIG_RQM { .name = "mbuf", .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, .seq_show = cgroup_mbuf_show, .seq_start = cgroup_mbuf_start, .seq_next = cgroup_mbuf_next, .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, }, +#endif #ifdef CONFIG_PSI { .name = "cpu.pressure", @@ -443,7 +447,7 @@ static struct cftype files[] = { { .name = "sli.control", .write = cgroup_sli_control_write, - .seq_show = cgroup_sli_control_show, + .seq_show = cpuacct_cgroup_sli_control_show, }, { .name = "sli.monitor", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca9a6dedd786fd4d68bd744efc8992d8f261b9e8..d5c39baf26a96971d6205d8ffed614bebb02078c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1617,6 +1617,8 @@ static void priority_kill_process(struct task_struct *victim, struct task_struct *p; struct mm_struct *mm; struct mem_cgroup *memcg; + unsigned long flags; + struct cgroup *cgrp; p = find_lock_task_mm(victim); if (!p) { @@ -1637,8 +1639,11 @@ static void priority_kill_process(struct task_struct *victim, /* Now we select [ victim ] to kill, just record it to mbuf */ memcg = mem_cgroup_from_task(victim); - mbuf_print(memcg->css.cgroup, "memqos: Killing process [ %s ] pid [ %d ] for memory reclaim", + cgrp = memcg->css.cgroup; + spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags); + mbuf_print(cgrp, "memqos: Killing process [ %s ] pid [ %d ] for memory reclaim", victim->comm, victim->pid); + spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags); /* Get a reference to safely compare mm 
after task_unlock(victim) */ mm = victim->mm; @@ -7714,6 +7719,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .release = cgroup_pressure_release, }, #endif +#ifdef CONFIG_RQM + { + .name = "mbuf", + .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_mbuf_open, + .seq_show = cgroup_mbuf_show, + .seq_start = cgroup_mbuf_start, + .seq_next = cgroup_mbuf_next, + .seq_stop = cgroup_mbuf_stop, + .release = cgroup_mbuf_release, + }, +#endif #ifdef CONFIG_CGROUP_SLI { .name = "sli", @@ -7725,6 +7742,22 @@ static struct cftype mem_cgroup_legacy_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = mem_cgroup_sli_max_show, }, + { + .name = "sli.control", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_sli_control_write, + .seq_show = mem_cgroup_sli_control_show, + }, + { + .name = "sli.monitor", + .flags = CFTYPE_NOT_ON_ROOT, + .open = cgroup_sli_monitor_open, + .seq_show = cgroup_sli_monitor_show, + .seq_start = cgroup_sli_monitor_start, + .seq_next = cgroup_sli_monitor_next, + .seq_stop = cgroup_sli_monitor_stop, + .poll = cgroup_sli_monitor_poll, + }, #endif #ifdef CONFIG_EMM_MEMORY_RECLAIM { diff --git a/net/Kconfig b/net/Kconfig index 1d4211802ebdd29393e465f2ccb9a9bae54016ea..d45927d9593bcf49a5819504b4aa9cc395988c87 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -99,6 +99,15 @@ source "net/netlabel/Kconfig" endif # if INET +config NETNS_MBUF + bool "attach a mbuf to net namespace" + depends on RQM && INET && PROC_FS + default y + ---help--- + this allows attach a mbuf to each net namespace. + + if you are unsure how to answer this question, answer N. + config NETWORK_SECMARK bool "Security Marking" help diff --git a/net/core/Makefile b/net/core/Makefile index 518cdc0878c94f9df30625b281b15e6c62b8a277..73c483e230135ebdc69af29730d17e0cc29525c7 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o +obj-$(CONFIG_NETNS_MBUF) += netns_mbuf.o diff --git a/net/core/netns_mbuf.c b/net/core/netns_mbuf.c new file mode 100644 index 0000000000000000000000000000000000000000..81dee51f5c85f5035c8b21c3c2b59e6590988975 --- /dev/null +++ b/net/core/netns_mbuf.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* make mbuf can be used by net namespace + * + * Author: mengensun + * Author: yuehongwu + * Copyright (C) 2024 Tencent, Inc + */ +#include +#include +#include + +#include +#include + +extern int sysctl_qos_mbuf_enable; +struct mbuf_seq_data { + struct seq_net_private snp; + struct mbuf_user_desc udesc; + struct mbuf_slot snapshot[]; +}; + +static inline struct mbuf_slot *get_net_mbuf(struct net *net) +{ + return net->mbuf.slot; +} + +/* not controlled by sysctl_qos_mbuf_enable because we will + * have a /proc/net/ipv4/netlat/enable in later patch + */ +ssize_t net_mbuf_print(struct net *net, const char *fmt, ...) +{ + va_list args; + struct mbuf_slot *slot; + + slot = net->mbuf.slot; + if (!slot || !__ratelimit(&slot->ratelimit)) + goto out; + + va_start(args, fmt); + slot->ops->write(slot, fmt, args); + va_end(args); +out: + return 0; +} +EXPORT_SYMBOL(net_mbuf_print); + +/* udesc is the user side interface, used to get data from mbuf, + * we can alloc a udesc per user, not to alloc a udesc and bind + * to mbuf when user accessing mbuf. 
+ * + * The seq_file private data is the ideal place to hold the udesc; + * keeping the udesc there keeps everything simple. + */ +static void *netns_mbuf_start(struct seq_file *s, loff_t *pos) +{ + u32 index; + struct mbuf_user_desc *udesc; + struct mbuf_seq_data *pd; + + pd = s->private; + udesc = &pd->udesc; + index = *pos; + + /* why: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return NULL; + + /* If we already reached the end, just return */ + if (index && index == pd->snapshot->mring->next_idx) + return NULL; + + udesc->user_idx = pd->snapshot->mring->first_idx; + udesc->user_seq = pd->snapshot->mring->first_seq; + + /* The ring may already be at the end, or empty */ + if (udesc->user_idx == pd->snapshot->mring->next_idx) + return NULL; + return udesc; +} + +static void *netns_mbuf_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct mbuf_seq_data *pd; + struct mbuf_user_desc *udesc = v; + + pd = s->private; + + /* why: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return NULL; + + udesc->user_idx = pd->snapshot->ops->next(pd->snapshot->mring, + udesc->user_idx); + *pos = udesc->user_idx; + if (udesc->user_idx == pd->snapshot->mring->next_idx) + return NULL; + + return udesc; +} + +static void netns_mbuf_stop(struct seq_file *s, void *v) { } + +static int netns_mbuf_show(struct seq_file *s, void *v) +{ + ssize_t ret; + struct mbuf_seq_data *pd; + struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; + + pd = s->private; + + /* why: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return 0; + + memset(udesc->buf, 0, sizeof(udesc->buf)); + ret = pd->snapshot->ops->read(pd->snapshot, udesc); + if (ret > 0) + seq_printf(s, "%s", udesc->buf); + return 0; +} + +static int seq_mbuf_open(struct inode *inode, struct file *file) +{ + struct mbuf_seq_data *p; + struct mbuf_slot *mbuf; + + p = seq_open_net_large_private(inode, file); + + if (IS_ERR(p)) + return PTR_ERR(p); + + mbuf = get_net_mbuf(p->snp.net); + /* The netns may have no mbuf attached, because the mbuf + * pool has a maximum size. We still let the open succeed, + * so every seq_ops callback must check the mring pointer. + * + * Note: the private data was zeroed in + * seq_open_net_large_private(). + */ + if (!mbuf) + return 0; + + snapshot_mbuf(p->snapshot, mbuf, &mbuf->slot_lock); + return 0; +} + +/* This function is taken from seq_release_net(); it is identical + * except that it uses **vfree** to free the private data. + */ +static int seq_mbuf_release(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + + put_net(seq_file_net(seq)); + vfree(seq->private); + seq->private = NULL; + seq_release(ino, f); + return 0; +} + +/* any write clears the recorded data */ +ssize_t seq_mbuf_write(struct file *f, const char __user *ubuf, + size_t size, loff_t *_pos) +{ + struct seq_file *seq = f->private_data; + struct mbuf_seq_data *p; + struct mbuf_slot *mb; + + p = seq->private; + mb = get_net_mbuf(p->snp.net); + + /* this netns has no mbuf attached */ + if (!mb) + return size; + + mbuf_reset(mb); + return size; +} + +/* seq_read() holds a mutex while calling these functions, but + * that mutex belongs to the struct file, not to the inode, so it + * only serializes mbuf access among tasks sharing the same file + * object (e.g. multiple threads of one process). + * + * If several processes access the mbuf through their own file + * objects, their accesses are not serialized.
+ */ +static const struct seq_operations mbuf_seq_ops = { + .show = netns_mbuf_show, + .start = netns_mbuf_start, + .next = netns_mbuf_next, + .stop = netns_mbuf_stop, +}; + +static const struct file_operations mbuf_seq_fops = { + .open = seq_mbuf_open, + .read = seq_read, + .write = seq_mbuf_write, + .llseek = seq_lseek, + .release = seq_mbuf_release, +}; + +extern struct proc_dir_entry *proc_create_net_data_ops(const char *name, + umode_t mode, struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct file_operations *fops); + +static int __net_init net_mbuf_init(struct net *net) +{ + int ret = 0; + + /* If the mbuf allocation fails, still let the netns creation + * succeed: returning an error here would put an artificial + * limit on how many netns can be created on this system. + * + * Note: mbuf_slot currently has a maximum count of 1024; once + * all slots are in use, further allocations may fail. In that + * case we keep the user interface unchanged and let netlat + * simply report nothing. + * cgroup is used for kabi + * + * When the IPv4 protocol stack is initialized, the root netns + * (init_net) needs a mbuf_slot, so make sure this very first + * allocation can succeed. + */ + if (sysctl_qos_mbuf_enable || net == &init_net) { + net->mbuf.slot = mbuf_slot_alloc_v2((void *)net, NULL); + if (!net->mbuf.slot) + pr_err("netns: failed to alloc mbuf\n"); + } else + net->mbuf.slot = NULL; + + net->mbuf.twatcher = proc_net_mkdir(net, "twatcher", net->proc_net); + if (!net->mbuf.twatcher) { + ret = -ENOMEM; + goto free_mbuf; + } + + net->mbuf.log = proc_create_net_data_ops("log", S_IFREG | 0644, + net->mbuf.twatcher, + &mbuf_seq_ops, + sizeof(struct mbuf_seq_data) + get_mbuf_slot_len(), + NULL, &mbuf_seq_fops); + if (!net->mbuf.log) { + ret = -ENOMEM; + goto remove_watcher; + } + return ret; + +remove_watcher: + remove_proc_entry("twatcher", net->proc_net); + +free_mbuf: + if (net->mbuf.slot) + mbuf_free_slot(net->mbuf.slot); + return ret; +} + +static void __net_exit net_mbuf_exit(struct net *net) +{ + remove_proc_entry("log", net->mbuf.twatcher); + remove_proc_entry("twatcher", net->proc_net); + + /* if the mbuf allocation failed, there is nothing to free */ + if (!net->mbuf.slot) + return; + mbuf_free_slot(net->mbuf.slot); +} + +static struct pernet_operations net_mbuf_ops = { + .init = net_mbuf_init, + .exit = net_mbuf_exit, +}; + +int inet_mbuf_init(void) +{ + return register_pernet_subsys(&net_mbuf_ops); +} +EXPORT_SYMBOL(inet_mbuf_init); + +void inet_mbuf_exit(void) +{ + unregister_pernet_subsys(&net_mbuf_ops); +} +EXPORT_SYMBOL(inet_mbuf_exit); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index f5af8c6b2f87ecb53f55914f4044c9cff5bf1896..3d305a02fef8b92bcab0d970fae5ae413648f070 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -432,6 +432,18 @@ config INET_UDP_DIAG Support for UDP socket monitoring interface used by the ss tool. If unsure, say Y. +config NETLAT + bool "INET: allow collecting net latency info" + depends on NETNS_MBUF + default y + ---help--- + Enable hooks in the network stack to collect latency information. + If a latency exceeds the threshold configured through the user + interface, a message is printed to the mbuf. + + If unsure, say N.
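To make the interface above concrete, here is a minimal user-space sketch (not part of the patch) that reads and then clears the per-netns mbuf log. It assumes the file created by net_mbuf_init() is visible as /proc/net/twatcher/log inside the current network namespace; run it in the netns you want to inspect.

/*
 * Sketch only: dump the per-netns mbuf log, then clear it.
 * The path is derived from proc_net_mkdir(..., "twatcher", ...) and
 * proc_create_net_data_ops("log", ...) above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Print whatever latency messages have been recorded so far. */
	fd = open("/proc/net/twatcher/log", O_RDONLY);
	if (fd < 0) {
		perror("open /proc/net/twatcher/log");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);

	/* Any write resets the ring (see seq_mbuf_write() above). */
	fd = open("/proc/net/twatcher/log", O_WRONLY);
	if (fd < 0) {
		perror("open /proc/net/twatcher/log for write");
		return 1;
	}
	if (write(fd, "0", 1) < 0)
		perror("clear");
	close(fd);
	return 0;
}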
+ + config INET_RAW_DIAG tristate "RAW: socket monitoring interface" depends on INET_DIAG && (IPV6 || IPV6=n) diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9e1a186a3671e249f3d10b14497098772447267d..921a5c8186ef7f5aa6fc8a5d2e73402435f16bd4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-$(CONFIG_NETLAT) += netlat.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 36405511052abfc2c6a207445ce9ff8020732397..26d35bca543681ea558fa8aaa08c0665c92ace7f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -2075,6 +2075,10 @@ fs_initcall(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS +#ifdef CONFIG_NETNS_MBUF +extern int inet_mbuf_init(void); +extern void inet_mbuf_exit(void); +#endif static int __init ipv4_proc_init(void) { int rc = 0; @@ -2087,11 +2091,19 @@ static int __init ipv4_proc_init(void) goto out_udp; if (ping_proc_init()) goto out_ping; +#ifdef CONFIG_NETNS_MBUF + if (inet_mbuf_init()) + goto out_mbuf; +#endif if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: +#ifdef CONFIG_NETNS_MBUF + inet_mbuf_exit(); +out_mbuf: +#endif ping_proc_exit(); out_ping: udp4_proc_exit(); diff --git a/net/ipv4/netlat.c b/net/ipv4/netlat.c new file mode 100644 index 0000000000000000000000000000000000000000..3984e85109683706f9092ff1f5b060c1a17d09b4 --- /dev/null +++ b/net/ipv4/netlat.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Net Latency Monitor base on Quality Monitor Buffer + * Aim to provide net latency for a long running system + * + * Author: mengensun + * Author: yuehongwu + * Copyright (C) 2024 Tencent, Inc + */ + +#include +#include +#include +#include +#include "netlat.h" + +struct netlat_net_data { + int ack; + int pick; + int queue; + int enable; + unsigned long *ports; + struct ctl_table_header *netlat_hdr; +}; + +static unsigned int netlat_net_id __read_mostly; +DEFINE_STATIC_KEY_FALSE(enable_netlat); + +static inline int get_ack_lat(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->ack; +} + +static inline int get_pick_lat(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->pick; +} + +static inline int get_queue_lat(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->queue; +} + +static inline long *get_net_ports(struct net *net) +{ + struct netlat_net_data *pdata; + + pdata = net_generic(net, netlat_net_id); + return pdata->ports; +} + +static inline u32 get_rtxq_skb_jiffies(struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->first_xmit_time; +} + +static inline void set_rtxq_skb_jiffies(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->first_xmit_time = tcp_jiffies32; +} + +/* sk is not used for now, but, may be used in the future + */ +void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, + struct sk_buff *src) +{ + if (!static_branch_unlikely(&enable_netlat)) + return; + TCP_SKB_CB(dst)->first_xmit_time = TCP_SKB_CB(src)->first_xmit_time; +} +EXPORT_SYMBOL(netlat_copy_rtxq_skb); + +static inline u32 tcp_jiffies32_delt(struct sk_buff *skb) +{ + u32 j1, j2; + + j1 = tcp_jiffies32; 
+ j2 = get_rtxq_skb_jiffies(skb); + + /* There is a small window here: when the skb is allocated, + * ack_num is initialized to 0, so if we never touched the + * timestamp stored there it is still zero. + */ + if (!j2) + return 0; + + if (likely(j1 >= j2)) + return j1 - j2; + /* the u32 counter wrapped around */ + return U32_MAX - (j2 - j1) + 1; +} + +/* sk is not used for now, but may be used in the future + */ +void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb) +{ + if (!static_branch_unlikely(&enable_netlat)) + return; + set_rtxq_skb_jiffies(skb); +} +EXPORT_SYMBOL(netlat_tcp_enrtxqueue); + +/* print a message to the per-netns mbuf when an ACK latency above + * the configured threshold is observed + */ +void netlat_ack_check(struct sock *sk, struct sk_buff *skb) +{ + struct net *net; + s64 thresh; + s64 lat; + long *ports; + + if (!static_branch_unlikely(&enable_netlat)) + return; + + net = sock_net(sk); + + thresh = get_ack_lat(net); + if (!thresh) + return; + + lat = tcp_jiffies32_delt(skb); + if (lat < thresh) + return; + + ports = get_net_ports(net); + if (!test_bit(sk->sk_num, ports)) + return; + + net_mbuf_print(net, "TCP AC %u %pI4 %d %pI4 %d\n", + (unsigned int)(jiffies_to_msecs(lat)), + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); +} +EXPORT_SYMBOL(netlat_ack_check); + +/* netlat/enable is only visible in the root netns. + * + * The following three functions must be called with `lock` (defined + * below) held. Under that lock we follow these rules: + * + * 1. when `enable` is turned off: if we have enabled + * net_timestamp, disable it + * + * 2. when `enable` is turned on: if `pick/queue` need + * net_timestamp, enable it + * + * 3. when `pick/queue` are written and need net_timestamp + * while `enable` is off, just record that net_timestamp + * is needed and do nothing, leaving it to rule 2 above + * + * 4.
when `pick/queue` are writing and need enable + * net_timestamp and if `enable` enabled, just + * enable net_timestamp by themself + */ +static struct mutex lock = __MUTEX_INITIALIZER(lock); +static unsigned long need_time_stamp; + +/* for pick/queue write: see comment above */ +static void handle_net_timestamp(bool closed) +{ + /*!0->0*/ + if (closed) { + need_time_stamp--; + if (need_time_stamp == 0 && + static_branch_unlikely(&enable_netlat)) + net_disable_timestamp(); + return; + } + + /*0->!0*/ + need_time_stamp++; + if (need_time_stamp == 1 && + static_branch_unlikely(&enable_netlat)) + net_enable_timestamp(); +} + +/* for enable write: see comment above */ +static void handle_netlat_enable(bool closed) +{ + /*!0->0*/ + if (closed) { + if (need_time_stamp) + net_disable_timestamp(); + static_branch_disable(&enable_netlat); + return; + } + + /*0->!0*/ + if (need_time_stamp) + net_enable_timestamp(); + static_branch_enable(&enable_netlat); +} + +/* for netns exits: see comment above */ +static void handle_net_timestamp_exit(bool queue, bool pick) +{ + need_time_stamp -= queue; + need_time_stamp -= pick; + + if (!static_branch_unlikely(&enable_netlat)) + return; + /* if we dec the counter to zero and netlat enabled + * disable the timestamp + */ + if (!need_time_stamp && (queue || pick)) + net_disable_timestamp(); +} + +static int proc_do_netlat_pick(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int prev; + int ret; + struct netlat_net_data *pdata; + + mutex_lock(&lock); + + pdata = container_of(table->data, struct netlat_net_data, pick); + prev = pdata->pick; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + /* only change timestamp from 0->!0 or !0->0 */ + if (!!prev == !!pdata->pick) + goto unlock; + handle_net_timestamp(!!prev); + +unlock: + mutex_unlock(&lock); + return ret; +} + +static int proc_do_netlat_queue(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int prev; + int ret; + struct netlat_net_data *pdata; + + mutex_lock(&lock); + pdata = container_of(table->data, struct netlat_net_data, queue); + prev = pdata->queue; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + /* only change timestamp from 0->!0 or !0->0 */ + if (!!prev == !!pdata->queue) + goto unlock; + handle_net_timestamp(!!prev); + +unlock: + mutex_unlock(&lock); + return ret; +} + +static int proc_do_netlat_enable(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int prev; + int ret; + struct netlat_net_data *pdata; + + mutex_lock(&lock); + + pdata = container_of(table->data, struct netlat_net_data, enable); + prev = pdata->enable; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!!prev == !!pdata->enable) + goto unlock; + handle_netlat_enable(!!prev); + +unlock: + mutex_unlock(&lock); + return ret; +} + +static struct ctl_table ipv4_netlat[] = { + { + .procname = "lports", + .data = NULL, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, + { + .procname = "ack", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "queue", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_netlat_queue, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "pick", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = proc_do_netlat_pick, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "enable", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_netlat_enable, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int netlat_init_ipv4_ctl_table(struct net *net) +{ + int ret; + struct netlat_net_data *pdata; + struct ctl_table *table; + + table = ipv4_netlat; + pdata = net_generic(net, netlat_net_id); + + ret = 0; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ipv4_netlat), GFP_KERNEL); + if (!table) { + ret = -ENOMEM; + goto out; + } + + /* do not export enable to son netns */ + memset(&table[4], 0, sizeof(struct ctl_table)); + } + + pdata->ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!pdata->ports) { + ret = -ENOMEM; + goto free_table; + } + + table[0].data = &pdata->ports; + table[1].data = &pdata->ack; + table[2].data = &pdata->queue; + table[3].data = &pdata->pick; + + /* do not export enable to son netns*/ + if (net_eq(net, &init_net)) + table[4].data = &pdata->enable; + + pdata->netlat_hdr = register_net_sysctl(net, "net/ipv4/netlat", table); + if (!pdata->netlat_hdr) { + ret = -ENOMEM; + goto free_ports; + } + return ret; + +free_ports: + kfree(pdata->ports); +free_table: + if (!net_eq(net, &init_net)) + kfree(table); +out: + return ret; +} + +static void netlat_exit_ipv4_ctl_table(struct net *net) +{ + struct netlat_net_data *pdata; + struct ctl_table *table; + + pdata = net_generic(net, netlat_net_id); + + table = pdata->netlat_hdr->ctl_table_arg; + unregister_net_sysctl_table(pdata->netlat_hdr); + + /* root netns never exit*/ + if (net_eq(net, &init_net)) + return; + + mutex_lock(&lock); + handle_net_timestamp_exit(!!pdata->queue, !!pdata->pick); + mutex_unlock(&lock); + + kfree(table); + kfree(pdata->ports); +} + +/* print msg to per net mbuf when latency from + * netif to queued on tcp receive queue + */ +void netlat_queue_check(struct sock *sk, struct sk_buff *skb, int flags) +{ + struct net *net; + s64 lat; + int thresh; + long *ports; + + if (!static_branch_unlikely(&enable_netlat)) + return; + + net = sock_net(sk); + if (!skb->tstamp) + return; + + thresh = get_queue_lat(net); + if (!thresh) + return; + + ports = get_net_ports(net); + if (!test_bit(sk->sk_num, ports)) + return; + + if (!skb->tstamp) + return; + + lat = ktime_to_ms(net_timedelta(skb->tstamp)); + lat = lat < 0 ? 0 : lat; + if (lat < thresh) + return; + if (flags & QUEUE_FLAG_RCV) + net_mbuf_print(net, "TCP QU %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); + else /* QUEUE_FLAG_OFO for now */ + net_mbuf_print(net, "TCP OO %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, + &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); +} +EXPORT_SYMBOL(netlat_queue_check); + +/* print msg to per net mbuf when latency from + * netif to pick by usr app + */ +void netlat_pick_check(struct sock *sk, struct sk_buff *skb) +{ + struct net *net; + s64 lat; + int thresh; + long *ports; + + if (!static_branch_unlikely(&enable_netlat)) + return; + + net = sock_net(sk); + if (!skb->tstamp) + return; + + thresh = get_pick_lat(net); + if (!thresh) + return; + + ports = get_net_ports(net); + if (!test_bit(sk->sk_num, ports)) + return; + + if (!skb->tstamp) + return; + + lat = ktime_to_ms(net_timedelta(skb->tstamp)); + lat = lat < 0 ? 
0 : lat; + if (lat < thresh) + return; + + net_mbuf_print(net, "TCP PI %u %pI4 %d %pI4 %d\n", + (unsigned int)lat, &sk->sk_rcv_saddr, (int)sk->sk_num, + &sk->sk_daddr, (int)ntohs(sk->sk_dport)); +} +EXPORT_SYMBOL(netlat_pick_check); + +static struct pernet_operations netlat_net_ops = { + .init = netlat_init_ipv4_ctl_table, + .exit = netlat_exit_ipv4_ctl_table, + .id = &netlat_net_id, + .size = sizeof(struct netlat_net_data), +}; + +/* add some config file in proc + */ +int netlat_net_init(void) +{ + return register_pernet_subsys(&netlat_net_ops); +} +EXPORT_SYMBOL(netlat_net_init); + +void netlat_net_exit(void) +{ + unregister_pernet_subsys(&netlat_net_ops); +} +EXPORT_SYMBOL(netlat_net_exit); diff --git a/net/ipv4/netlat.h b/net/ipv4/netlat.h new file mode 100644 index 0000000000000000000000000000000000000000..b0a59c5585a0407470da1f44cc7cf50cd156f7c1 --- /dev/null +++ b/net/ipv4/netlat.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * add a netlat to monitor tcp -package latency + * + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc + */ + +#ifndef H______NETLAT +#define H______NETLAT + +#ifdef CONFIG_NETLAT + +#define QUEUE_FLAG_OFO 0x1 +#define QUEUE_FLAG_RCV 0x2 + +int netlat_net_init(void); +void netlat_net_exit(void); +void netlat_ack_check(struct sock *sk, struct sk_buff *skb); +void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src); +void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb); +#define netlat_check(oldest, sk, skb) \ +do { \ + if (oldest) { \ + netlat_ack_check(sk, skb); \ + oldest = false; \ + } \ +} while (0) + +void netlat_queue_check(struct sock *sk, struct sk_buff *skb, int flags); +void netlat_pick_check(struct sock *sk, struct sk_buff *skb); + +#else /* CONFIG_NETLAT */ +static __always_inline int netlat_net_init(void) { return 0; }; +static __always_inline void netlat_net_exit(void) { }; +static __always_inline void netlat_ack_check(struct sock *sk, + struct sk_buff *skb) { }; +static __always_inline void netlat_copy_rtxq_skb(struct sock *sk, + struct sk_buff *dst, + struct sk_buff *src) { }; +static __always_inline void netlat_tcp_enrtxqueue(struct sock *sk, + struct sk_buff *skb) { }; +#define netlat_check(oldest, sk, skb) + +#define QUEUE_FLAG_OFO 0x1 +#define QUEUE_FLAG_RCV 0x2 +#define netlat_queue_check(sk, skb, flags) + +#define netlat_pick_check(sk, skb) +#endif /* !CONFIG_NETLAT */ +#endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d1253a5359f347e9c759bc262adac28551ebf7a5..8732a1712bf81ef5471069822c687ef7073128a6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1385,6 +1385,14 @@ static struct ctl_table ipv4_net_table[] = { { } }; +#ifdef CONFIG_NETLAT +extern int netlat_net_init(void); +/* + * this is not used for now, but sameone may used laterly + * just put here + */ +extern int netlat_net_exit(void); +#endif static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; @@ -1449,6 +1457,23 @@ static __init int sysctl_ipv4_init(void) return -ENOMEM; } +#ifdef CONFIG_NETLAT + /* + * this must after the register of ipv4_sysctl_ops + * because we are on the sub-tree of "net/ipv4" + * setup_net call init one by one, while cleanup_net + * call init one by one in reversed order + */ + if (netlat_net_init()) { + unregister_pernet_subsys(&ipv4_sysctl_ops); + unregister_net_sysctl_table(hdr); + return -ENOMEM; + } +#endif + /* + * btw: if someone adding some code here, do not + * forget the !!netlat_net_exit!! 
function + */ return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4063356a0c406add91a1c1c3c7ad947fc8e66355..9b60cdaa8d9333156a69dceee7c7bc72aaa40bf1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1953,6 +1953,9 @@ static int tcp_inq_hint(struct sock *sk) return inq; } +#ifdef CONFIG_NETLAT +extern void netlat_pick_check(struct sock *sk, struct sk_buff *skb); +#endif /* * This routine copies from a sock struct into the user buffer. * @@ -2179,15 +2182,23 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; - if (!(flags & MSG_PEEK)) + if (!(flags & MSG_PEEK)) { +#ifdef CONFIG_NETLAT + netlat_pick_check(sk, skb); +#endif sk_eat_skb(sk, skb); + } continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); - if (!(flags & MSG_PEEK)) + if (!(flags & MSG_PEEK)) { +#ifdef CONFIG_NETLAT + netlat_pick_check(sk, skb); +#endif sk_eat_skb(sk, skb); + } break; } while (len > 0); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9a2fd0cd68a245b7d7c58f8f5026f5338cfed19e..014b9e8bc4c4ccd206e6c4171b245c6f250f11ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3103,6 +3103,10 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, } } +#ifdef CONFIG_NETLAT +extern void netlat_ack_check(struct sock *sk, struct sk_buff *skb); +extern void netlat_queue_check(struct sock *sk, struct sk_buff *skb); +#endif /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. @@ -3124,6 +3128,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; + bool __maybe_unused netlat_oldest = true; int flag = 0; first_ackt = 0; @@ -3205,6 +3210,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; tcp_highest_sack_replace(sk, skb, next); +#ifdef CONFIG_NETLAT + /* + * here in for! we have make the ts of skb in rtx queue + * Monotonically incremental with the seq_num of skb,so + * here we can only report the oldest skb's latency. + * + * btw: the oldest skb's latency is the max latency see + * by this function + */ + if (netlat_oldest) { + netlat_ack_check(sk, skb); + netlat_oldest = false; + } +#endif tcp_rtx_queue_unlink_and_free(skb, sk); } @@ -4764,6 +4783,10 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); +#ifdef CONFIG_NETLAT + netlat_queue_check(sk, skb); +#endif + eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b8dd6d7cc91e00563db953d583d287f226b41a0c..c823f4ac218bc3a9ec0d66f1ba0faf83377efca5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -48,6 +48,11 @@ #include +#ifdef CONFIG_NETLAT +extern void netlat_copy_rtxq_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src); +extern void netlat_tcp_enrtxqueue(struct sock *sk, struct sk_buff *skb); +#endif + /* Refresh clocks of a TCP socket, * ensuring monotically increasing values. 
*/ @@ -72,6 +77,12 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); +#ifdef CONFIG_NETLAT + /* + * for the tcp in established status, and normal data skb + */ + netlat_tcp_enrtxqueue(sk, skb); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) @@ -1505,9 +1516,28 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); - if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) + if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) { +#ifdef CONFIG_NETLAT + /* + * for skb in rtx queue and be splitted: + * + * eg: we receive an ack, the ack only partially + * acked an skb in rtx queue, we need split the + * partially acked skb, release the acked bytes + * and collect the remained bytes to `buff`, insert + * buff to rtx queue again + * some other condition like: + * partially sacked a skb in rtx queue + * partially dsacked a skb in rtx queue + * .... + * here we should copy the origin skb's ts to the + * new one(buff) + */ + netlat_copy_rtxq_skb(sk, buff, skb); +#endif list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); + } return 0; } @@ -3386,6 +3416,14 @@ int tcp_send_synack(struct sock *sk) tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); +#ifdef CONFIG_NETLAT + /* + * for crossed SYN-ACK, eg: we are in + * syn-send status, and received a pure + * syn skb from the peer + */ + netlat_tcp_enrtxqueue(sk, nskb); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); @@ -3694,6 +3732,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); +#ifdef CONFIG_NETLAT + /* + * for fastopen sock which send data in the syn skb + */ + netlat_tcp_enrtxqueue(sk, syn_data); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; @@ -3745,6 +3789,10 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); +#ifdef CONFIG_NETLAT + /* for the syn package */ + netlat_tcp_enrtxqueue(sk, buff); +#endif tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */
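As a closing usage note, below is a minimal user-space sketch (not part of the patch) of how the netlat knobs registered earlier might be driven. The paths follow register_net_sysctl(net, "net/ipv4/netlat", ...); judging from the proc handlers, "ack" is compared against a jiffies delta while "queue" and "pick" are compared against milliseconds, so the values used here are illustrative assumptions rather than recommended settings.

/*
 * Sketch only: arm the netlat thresholds, then watch the per-netns
 * mbuf log for records. Paths and record format are taken from the
 * patch above; threshold units are inferred from the handlers.
 */
#include <stdio.h>

static int put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Watch ports 80 and 443 (proc_do_large_bitmap takes lists/ranges). */
	put("/proc/sys/net/ipv4/netlat/lports", "80,443");
	/* Report skbs that sit unacknowledged too long (jiffies-based). */
	put("/proc/sys/net/ipv4/netlat/ack", "100");
	/* Report receive-path delays longer than ~50ms (milliseconds). */
	put("/proc/sys/net/ipv4/netlat/queue", "50");
	put("/proc/sys/net/ipv4/netlat/pick", "50");
	/* Only the root netns exposes the global switch. */
	put("/proc/sys/net/ipv4/netlat/enable", "1");

	puts("netlat armed; records appear in /proc/net/twatcher/log as");
	puts("\"TCP <AC|QU|OO|PI> <lat> <saddr> <sport> <daddr> <dport>\"");
	return 0;
}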