From 69e55430e9eb2f6df76f2709a744b388d69d567b Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Wed, 9 Mar 2022 16:43:21 +0800 Subject: [PATCH 001/340] block: add a switch for precise iostat accounting hulk inclusion category: bugfix bugzilla: 39265, https://gitee.com/openeuler/kernel/issues/I4WC06 CVE: NA ----------------------------------------------- When the inflight IOs are slow and no new IOs are issued, we expect iostat could manifest the IO hang problem. However after commit 9c6dea45e6f7 ("block: delete part_round_stats and switch to less precise counting"), io_tick and time_in_queue will not be updated until the end of IO, and the avgqu-sz and %util columns of iostat will be zero. To fix it, we could fallback to the implementation before commit 9c6dea45e6f7, but it may cause performance regression on NVMe device or bio-based device (due to overhead of inflight calculation), so add a switch to control whether or not to use precise iostat accounting. It can be enabled by adding "precise_iostat=1" in kernel boot cmdline. When precise accouting is enabled, io_tick and time_in_queue will be updated when accessing /proc/diskstats and /sys/block/sdX/sdXN/stat. Fixes: 9c6dea45e6f7 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Zhang Wensheng Reviewed-by: Jason Yan Signed-off-by: Yang Yingliang --- block/bio.c | 8 ++++++-- block/blk-core.c | 30 +++++++++++++++++++++++++++--- block/blk-merge.c | 2 ++ block/genhd.c | 7 +++++++ block/partition-generic.c | 8 ++++++++ include/linux/blkdev.h | 1 + 6 files changed, 51 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index d94243411ef3..b50d3b59c79b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1706,9 +1706,13 @@ void generic_end_io_acct(struct request_queue *q, int req_op, const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - update_io_ticks(cpu, part, now); + if (precise_iostat) { + part_round_stats(q, cpu, part); + } else { + update_io_ticks(cpu, part, now); + part_stat_add(cpu, part, time_in_queue, duration); + } part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_add(cpu, part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 41d0b09e9a67..df733e8caa6a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -56,6 +56,20 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); +bool precise_iostat; +static int __init precise_iostat_setup(char *str) +{ + bool precise; + + if (!strtobool(str, &precise)) { + precise_iostat = precise; + pr_info("precise iostat %d\n", precise_iostat); + } + + return 1; +} +__setup("precise_iostat=", precise_iostat_setup); + /* * For the allocated request tables */ @@ -1700,8 +1714,13 @@ static void part_round_stats_single(struct request_queue *q, int cpu, struct hd_struct *part, unsigned long now, unsigned int inflight) { - if (inflight) + if (inflight) { + if (precise_iostat) { + __part_stat_add(cpu, part, time_in_queue, + inflight * (now - part->stamp)); + } __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); + } part->stamp = now; } @@ -2771,10 +2790,15 @@ void blk_account_io_done(struct request *req, u64 now) cpu = part_stat_lock(); part = req->part; - update_io_ticks(cpu, part, jiffies); + if (!precise_iostat) { + update_io_ticks(cpu, part, jiffies); + part_stat_add(cpu, part, time_in_queue, + nsecs_to_jiffies64(now - req->start_time_ns)); + } else { + part_round_stats(req->q, cpu, 
part); + } part_stat_inc(cpu, part, ios[sgrp]); part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); - part_stat_add(cpu, part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4c17c1031e34..d2fabe1fdf32 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -669,6 +669,8 @@ static void blk_account_io_merge(struct request *req) cpu = part_stat_lock(); part = req->part; + if (precise_iostat) + part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/genhd.c b/block/genhd.c index 183612cbbd6b..e7b97fdb4173 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1352,6 +1352,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight[2]; + int cpu; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1363,6 +1364,12 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { + if (precise_iostat) { + cpu = part_stat_lock(); + part_round_stats(gp->queue, cpu, hd); + part_stat_unlock(); + } + part_in_flight(gp->queue, hd, inflight); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " diff --git a/block/partition-generic.c b/block/partition-generic.c index 739c0cc5fd22..c4ac7a8c77dc 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "partitions/check.h" @@ -121,6 +122,13 @@ ssize_t part_stat_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; unsigned int inflight[2]; + int cpu; + + if (precise_iostat) { + cpu = part_stat_lock(); + part_round_stats(q, cpu, p); + part_stat_unlock(); + } part_in_flight(q, p, inflight); return sprintf(buf, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 28ea02865ecc..a86659e78d98 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -28,6 +28,7 @@ #include #include +extern bool precise_iostat; struct module; struct scsi_ioctl_command; -- Gitee From 7165cb45a71e81414304ee7e15f12a5aa9e0ccdf Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Wed, 9 Mar 2022 16:43:22 +0800 Subject: [PATCH 002/340] block: account inflight from blk_account_io_start() if 'precise_iostat' is set hulk inclusion category: bugfix bugzilla: 39265, https://gitee.com/openeuler/kernel/issues/I4WC06 CVE: NA ----------------------------------------------- When 'precise_iostat' is set, io_tick and time_in_queue will be updated if io is inflight. However, mq will not account io as inflight until getting driver tag, while sq will account it after getting sched tag. On the other hand, if io scheduler is none, blk_mq_get_tag() will get driver tag directly, and such io will be accounted as inflight. The consequences is that 'io_tick' will be miscalculated in part_round_stats(). Thus revert to sq's implementation if 'precise_iostat' is set. 
Signed-off-by: Zhang Wensheng Reviewed-by: Jason Yan Signed-off-by: Yang Yingliang --- block/genhd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index e7b97fdb4173..fc24384e2c85 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -24,6 +24,7 @@ #include "blk.h" +extern bool precise_iostat; static DEFINE_MUTEX(block_class_lock); struct kobject *block_depr; @@ -47,7 +48,7 @@ static void disk_release_events(struct gendisk *disk); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (!precise_iostat && q->mq_ops) return; atomic_inc(&part->in_flight[rw]); @@ -57,7 +58,7 @@ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (!precise_iostat && q->mq_ops) return; atomic_dec(&part->in_flight[rw]); @@ -68,7 +69,7 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) void part_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + if (!precise_iostat && q->mq_ops) { blk_mq_in_flight(q, part, inflight); return; } @@ -85,7 +86,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + if (!precise_iostat && q->mq_ops) { blk_mq_in_flight_rw(q, part, inflight); return; } -- Gitee From 98aa2e7ecb2b2ea862aa8b327d2c57521f89b289 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 9 Mar 2022 16:43:23 +0800 Subject: [PATCH 003/340] dm multipath: fix missing blk_account_io_done() in error path hulk inclusion category: bugfix bugzilla: 186143, https://gitee.com/openeuler/kernel/issues/I4WC06 CVE: NA ----------------------------------------------- If precise_iostat is enabled, inflight will be recorded and cleared by atomic operations. However, for dm multipath, inflight will be recorded by dm_requeue_original_request(), and if some error happened, multipath_release_clone() won't clear inflight, which will cause inflight to leak. Furthermore, %util will always be 100 in iostat. Fix the problem by calling __blk_mq_end_request() in multipath_release_clone() instead. Signed-off-by: Yu Kuai Signed-off-by: Zhang Wensheng Reviewed-by: Jason Yan Signed-off-by: Yang Yingliang --- block/blk-core.c | 22 ++++++++++++++++++++-- block/blk.h | 9 +++++++++ drivers/md/dm-rq.c | 2 +- include/linux/blkdev.h | 2 ++ 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index df733e8caa6a..a5d80ab91170 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2669,8 +2669,10 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @q: the queue to submit the request * @rq: the request being queued + * @precise: true if io account with start and done will be balanced */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +blk_status_t __blk_insert_cloned_request(struct request_queue *q, + struct request *rq, bool precise) { unsigned long flags; int where = ELEVATOR_INSERT_BACK; @@ -2693,7 +2695,16 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. 
*/ - return blk_mq_request_issue_directly(rq); + ret = blk_mq_request_issue_directly(rq); + if (ret && precise) { + u64 now = 0; + + if (blk_mq_need_time_stamp(rq)) + now = ktime_get_ns(); + + blk_account_io_done(rq, now); + } + return ret; } spin_lock_irqsave(q->queue_lock, flags); @@ -2718,6 +2729,13 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return BLK_STS_OK; } +EXPORT_SYMBOL_GPL(__blk_insert_cloned_request); + +blk_status_t blk_insert_cloned_request(struct request_queue *q, + struct request *rq) +{ + return __blk_insert_cloned_request(q, rq, false); +} EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** diff --git a/block/blk.h b/block/blk.h index e496e26630f7..dde2141a32dd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -405,6 +405,15 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) return current->io_context; } +/* + * Only need start/end time stamping if we have stats enabled, or using + * an IO scheduler. + */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; +} + /* * Internal throttling interface */ diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 80683f2b723d..3bd805f7ce85 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -410,7 +410,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ clone->rq_flags |= RQF_IO_STAT; clone->start_time_ns = ktime_get_ns(); - r = blk_insert_cloned_request(clone->q, clone); + r = __blk_insert_cloned_request(clone->q, clone, true); if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE) /* must complete clone in terms of original request */ dm_complete_request(rq, r); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a86659e78d98..1deaf36eb237 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1012,6 +1012,8 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); +extern blk_status_t __blk_insert_cloned_request(struct request_queue *q, + struct request *rq, bool precise); extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **); -- Gitee From 9fefd49137cf42a0ba00fa98aa815ec209245840 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Mar 2022 10:43:36 +0800 Subject: [PATCH 004/340] io_uring: re-issue block requests that failed because of resources mainline inclusion from mainline-v5.9-rc1 commit b63534c41e20b474483b4ddf47efc858c17352e0 category: bugfix bugzilla: 186136, https://gitee.com/openeuler/kernel/issues/I4RM1D CVE: NA ------------------------------------------------- Mark the plug with nowait == true, which will cause requests to avoid blocking on request allocation. If they do, we catch them and reissue them from a task_work based handler. Normally we can catch -EAGAIN directly, but the hard case is for split requests. As an example, the application issues a 512KB request. The block core will split this into 128KB if that's the max size for the device. The first request issues just fine, but we run into -EAGAIN for some latter splits for the same request. 
As the bio is split, we don't get to see the -EAGAIN until one of the actual reads complete, and hence we cannot handle it inline as part of submission. This does potentially cause re-reads of parts of the range, as the whole request is reissued. There's currently no better way to handle this. Signed-off-by: Jens Axboe conflict: fs/io_uring.c Adding nowait to plug list is reverted in (62c774ed4831 ("io_uring: don't unconditionally set plug->nowait = true")) Signed-off-by: Laibin Qiu Reviewed-by: Zhang Yi --- fs/io_uring.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f933d4f0edb4..aeedf191b813 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -917,6 +917,13 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_comp_state *cs); +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock); +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -2363,10 +2370,90 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res, __io_req_complete(req, res, cflags, cs); } +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req, int error) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + ssize_t ret = -ECANCELED; + struct iov_iter iter; + int rw; + + if (error) { + ret = error; + goto end_req; + } + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + rw = READ; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + rw = WRITE; + break; + default: + printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", + req->opcode); + goto end_req; + } + + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + if (!ret) + return true; + kfree(iovec); +end_req: + io_cqring_add_event(req, ret, 0); + req_set_fail_links(req); + io_put_req(req); + return false; +} + +static void io_rw_resubmit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; + int err; + + __set_current_state(TASK_RUNNING); + + err = io_sq_thread_acquire_mm_files(ctx, req); + + if (io_resubmit_prep(req, err)) { + refcount_inc(&req->refs); + io_queue_async_work(req); + } +} +#endif + +static bool io_rw_reissue(struct io_kiocb *req, long res) +{ +#ifdef CONFIG_BLOCK + struct task_struct *tsk; + int ret; + + if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + return false; + + tsk = req->task; + init_task_work(&req->task_work, io_rw_resubmit); + ret = task_work_add(tsk, &req->task_work, true); + if (!ret) + return true; +#endif + return false; +} + static void __io_complete_rw(struct io_kiocb *req, long res, long res2, struct io_comp_state *cs) { - io_complete_rw_common(&req->rw.kiocb, res, cs); + if (!io_rw_reissue(req, res)) + io_complete_rw_common(&req->rw.kiocb, res, cs); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -2536,6 +2623,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; + if (kiocb->ki_flags & 
IOCB_DIRECT) + io_get_req_task(req); + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; @@ -3037,6 +3127,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2; if (req->file->f_op->read_iter) @@ -3054,6 +3145,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, goto copy_iov; kiocb_done(kiocb, ret2, cs); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -3120,6 +3213,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2; /* @@ -3157,6 +3251,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, goto copy_iov; kiocb_done(kiocb, ret2, cs); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); -- Gitee From 7448b91d9beaa37a768e6460c57938f92ade9706 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Mar 2022 10:43:37 +0800 Subject: [PATCH 005/340] block: don't ignore REQ_NOWAIT for direct IO mainline inclusion from mainline-v5.12-rc6 commit f8b78caf21d5bc3fcfc40c18898f9d52ed1451a5 category: bugfix bugzilla: 186136, https://gitee.com/openeuler/kernel/issues/I4RM1D CVE: NA ------------------------------------------------- If IOCB_NOWAIT is set on submission, then that needs to get propagated to REQ_NOWAIT on the block side. Otherwise we completely lose this information, and any issuer of IOCB_NOWAIT IO will potentially end up blocking on eg request allocation on the storage side. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe conflict: fs/block_dev.c Signed-off-by: Laibin Qiu Reviewed-by: Zhang Yi --- fs/block_dev.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 8b299347f2aa..9868b21b8ef9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -253,6 +253,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; + qc = submit_bio(&bio); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -407,6 +410,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; -- Gitee From 50fb507af014891e0fb2ecbeda7cb40cdcdaede1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 11 Mar 2022 10:43:38 +0800 Subject: [PATCH 006/340] io_uring: remove redundant initialization of variable ret mainline inclusion from mainline-v5.12-rc1 commit 4a245479c2312e6b51862c21af134d4191ab9cf7 category: bugfix bugzilla: 186136, https://gitee.com/openeuler/kernel/issues/I4RM1D CVE: NA ------------------------------------------------- The variable ret is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. 
Addresses-Coverity: ("Unused value") Fixes: b63534c41e20 ("io_uring: re-issue block requests that failed because of resources") Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe conflict: fs/io_uring.c Signed-off-by: Laibin Qiu Reviewed-by: Zhang Yi --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index aeedf191b813..4e552dfe1c64 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2374,7 +2374,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res, static bool io_resubmit_prep(struct io_kiocb *req, int error) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - ssize_t ret = -ECANCELED; + ssize_t ret; struct iov_iter iter; int rw; -- Gitee From 867ba1fd06f411f1ceaa33e978512803b48a722e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Mar 2022 10:43:39 +0800 Subject: [PATCH 007/340] io_uring: don't double complete failed reissue request mainline inclusion from mainline-v5.10-rc5 commit c993df5a688975bf9ce899706ca13d2bc8d6be25 category: bugfix bugzilla: 186136, https://gitee.com/openeuler/kernel/issues/I4RM1D CVE: NA ------------------------------------------------- Zorro reports that an xfstest test case is failing, and it turns out that for the reissue path we can potentially issue a double completion on the request for the failure path. There's an issue around the retry as well, but for now, at least just make sure that we handle the error path correctly. Cc: stable@vger.kernel.org Fixes: b63534c41e20 ("io_uring: re-issue block requests that failed because of resources") Reported-by: Zorro Lang Signed-off-by: Jens Axboe conflict: fs/io_uring.c Change based on e1e16097e265 ("io_uring: provide generic io_req_complete() helper") Signed-off-by: Laibin Qiu Reviewed-by: Zhang Yi --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4e552dfe1c64..bdcac452b174 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2408,9 +2408,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) return true; kfree(iovec); end_req: - io_cqring_add_event(req, ret, 0); req_set_fail_links(req); - io_put_req(req); return false; } -- Gitee From f7a31280eeb428b156d5d48604220f1ee84a1cb5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Mar 2022 10:43:40 +0800 Subject: [PATCH 008/340] io_uring: don't re-setup vecs/iter in io_resumit_prep() is already there mainline inclusion from mainline-v5.9-rc7 commit 8f3d749685e48c44dbe877ac9781079d85f914c8 category: bugfix bugzilla: 186136, https://gitee.com/openeuler/kernel/issues/I4RM1D CVE: NA ------------------------------------------------- If we already have mapped the necessary data for retry, then don't set it up again. It's a pointless operation, and we leak the iovec if it's a large (non-stack) vec. 
Fixes: b63534c41e20 ("io_uring: re-issue block requests that failed because of resources") Signed-off-by: Jens Axboe conflict: fs/io_uring.c Signed-off-by: Laibin Qiu Reviewed-by: Zhang Yi --- fs/io_uring.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bdcac452b174..7240b5423170 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2400,13 +2400,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) goto end_req; } - ret = io_import_iovec(rw, req, &iovec, &iter, false); - if (ret < 0) - goto end_req; - ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); - if (!ret) + if (!req->io) { + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + if (!ret) + return true; + kfree(iovec); + } else { return true; - kfree(iovec); + } end_req: req_set_fail_links(req); return false; -- Gitee From 2154471020979d12998292050eddf0e066b2a57d Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Fri, 11 Mar 2022 11:30:09 +0800 Subject: [PATCH 009/340] sched: Fix sleeping in atomic context at cpu_qos_write() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WOPM CVE: NA -------------------------------- cfs_bandwidth_usage_inc() need hold jump_label_mutex and might sleep, so we can not call it in atomic context. Fix this by moving cfs_bandwidth_usage_{inc,dec}() out of rcu read critical section. Fixes: f7b390cd929 ("sched: Change cgroup task scheduler policy") Signed-off-by: Zhang Qiao Reviewed-by: Chen Hui Reviewed-by: Wang Weiyang Signed-off-by: Yang Yingliang Signed-off-by: Laibin Qiu --- kernel/sched/core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b36a3b4c60e9..496ce71f93a7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6981,13 +6981,10 @@ static int tg_change_scheduler(struct task_group *tg, void *data) struct cgroup_subsys_state *css = &tg->css; tg->qos_level = qos_level; - if (qos_level == -1) { + if (qos_level == -1) policy = SCHED_IDLE; - cfs_bandwidth_usage_inc(); - } else { + else policy = SCHED_NORMAL; - cfs_bandwidth_usage_dec(); - } param.sched_priority = 0; css_task_iter_start(css, 0, &it); @@ -7015,6 +7012,13 @@ static int cpu_qos_write(struct cgroup_subsys_state *css, if (tg->qos_level == -1 && qos_level == 0) return -EINVAL; + cpus_read_lock(); + if (qos_level == -1) + cfs_bandwidth_usage_inc(); + else + cfs_bandwidth_usage_dec(); + cpus_read_unlock(); + rcu_read_lock(); walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); rcu_read_unlock(); -- Gitee From 5b4e1d322beedfe8abfbc60ccf85a6bf46c1cde9 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 11 Mar 2022 16:31:11 +0800 Subject: [PATCH 010/340] arm64: acpi: fix UBSAN warning mainline inclusion from mainline-v5.8-rc1 commit a194c33f45f83068ef13bf1d16e26d4ca3ecc098 category: bugfix bugzilla: 89150 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a194c33f45f83068ef13bf1d16e26d4ca3ecc098 -------------------------------- Will reported a UBSAN warning: UBSAN: null-ptr-deref in arch/arm64/kernel/smp.c:596:6 member access within null pointer of type 'struct acpi_madt_generic_interrupt' CPU: 0 PID: 0 Comm: swapper Not tainted 5.7.0-rc6-00124-g96bc42ff0a82 #1 Call trace: dump_backtrace+0x0/0x384 show_stack+0x28/0x38 dump_stack+0xec/0x174 
handle_null_ptr_deref+0x134/0x174 __ubsan_handle_type_mismatch_v1+0x84/0xa4 acpi_parse_gic_cpu_interface+0x60/0xe8 acpi_parse_entries_array+0x288/0x498 acpi_table_parse_entries_array+0x178/0x1b4 acpi_table_parse_madt+0xa4/0x110 acpi_parse_and_init_cpus+0x38/0x100 smp_init_cpus+0x74/0x258 setup_arch+0x350/0x3ec start_kernel+0x98/0x6f4 This is from the use of the ACPI_OFFSET in arch/arm64/include/asm/acpi.h. Replace its use with offsetof from include/linux/stddef.h which should implement the same logic using __builtin_offsetof, so that UBSAN wont warn. Reported-by: Will Deacon Suggested-by: Ard Biesheuvel Signed-off-by: Nick Desaulniers Reviewed-by: Jeremy Linton Acked-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/20200521100952.GA5360@willie-the-truck/ Link: https://lore.kernel.org/r/20200608203818.189423-1-ndesaulniers@google.com Signed-off-by: Will Deacon Signed-off-by: linyujun Reviewed-by: chenlifu Reviewed-by: He Ying Signed-off-by: Laibin Qiu --- arch/arm64/include/asm/acpi.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index f364115ac2ef..348f12447ecd 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -33,14 +34,14 @@ * is therefore used to delimit the MADT GICC structure minimum length * appropriately. */ -#define ACPI_MADT_GICC_MIN_LENGTH ACPI_OFFSET( \ +#define ACPI_MADT_GICC_MIN_LENGTH offsetof( \ struct acpi_madt_generic_interrupt, efficiency_class) #define BAD_MADT_GICC_ENTRY(entry, end) \ (!(entry) || (entry)->header.length < ACPI_MADT_GICC_MIN_LENGTH || \ (unsigned long)(entry) + (entry)->header.length > (end)) -#define ACPI_MADT_GICC_SPE (ACPI_OFFSET(struct acpi_madt_generic_interrupt, \ +#define ACPI_MADT_GICC_SPE (offsetof(struct acpi_madt_generic_interrupt, \ spe_interrupt) + sizeof(u16)) /* Basic configuration for ACPI */ -- Gitee From fd9502211efce477cf42ba9c84f563c853479fa8 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 11 Mar 2022 16:54:34 +0800 Subject: [PATCH 011/340] net: hns3: fix pf vlan filter out of work after self test driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4W90Z CVE: NA ---------------------------- After a self test, the vlan filter of PF is wrongly closed, cause the vlan filter out of work. The second parameter of enable_vlan_filter() must pass true, and in enable_vlan_filter(), it will decide whether to open or keep the status of vlan filter. 
Fixes: 79549957e66f ("Revert: net: hns3: adds support for extended VLAN mode and 'QOS' in vlan 802.1Q protocol.") Signed-off-by: Yonglong Liu Reviewed-by: li yongxin Signed-off-by: Laibin Qiu --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index ef870da7a8a4..be771c75e37e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -395,8 +395,7 @@ static void hns3_self_test(struct net_device *ndev, #if IS_ENABLED(CONFIG_VLAN_8021Q) if (dis_vlan_filter) - h->ae_algo->ops->enable_vlan_filter(h, - ndev->flags & IFF_PROMISC); + h->ae_algo->ops->enable_vlan_filter(h, true); #endif if (if_running) -- Gitee From 70cd9d1e519bf3685b97adda9bbfa2b98790aef5 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 11 Mar 2022 16:54:35 +0800 Subject: [PATCH 012/340] net: hns3: fix RMW issue for VLAN filter switch mainline inclusion from mainline-v5.6-rc6 commit 903b85d3adce99a5301d5959c4d3c9d14a7974d4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4W96A CVE: NA ---------------------------- According to the user manual, the ingress and egress VLAN filter are configured at the same time. Currently, hclge_init_vlan_config() and hclge_set_vlan_spoofchk() will both change the VLAN filter switch. So it's necessary to read the old configuration before modifying it. Fixes: fea4e7dc2790 ("net: hns3: add support for spoof check setting") Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller Signed-off-by: Yonglong Liu Reviewed-by: li yongxin Signed-off-by: Laibin Qiu --- .../hisilicon/hns3/hns3pf/hclge_main.c | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b4c8f410e6f8..ca11b49c93a2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8324,16 +8324,30 @@ static int hclge_set_vlan_filter_ctrl(struct hclge_dev *hdev, u8 vlan_type, struct hclge_desc desc; int ret; - hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_VLAN_FILTER_CTRL, false); + /* read current vlan filter parameter */ + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_VLAN_FILTER_CTRL, true); req = (struct hclge_vlan_filter_ctrl_cmd *)desc.data; req->vlan_type = vlan_type; - req->vlan_fe = filter_en ? fe_type : 0; req->vf_id = vf_id; + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get vport%u vlan filter config, ret = %d.\n", + vf_id, ret); + return ret; + } + + /* modify and write new config parameter */ + hclge_cmd_reuse_desc(&desc, false); + req->vlan_fe = filter_en ? 
+ (req->vlan_fe | fe_type) : (req->vlan_fe & ~fe_type); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) dev_err(&hdev->pdev->dev, - "set vlan filter fail, ret =%d.\n", ret); + "failed to set vport%u vlan filter, ret = %d.\n", + vf_id, ret); return ret; } -- Gitee From 7faf1f777a2a17fa874fb243f442e24115da69c0 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 11 Mar 2022 16:54:36 +0800 Subject: [PATCH 013/340] net: hns3: update hns3 version to 22.2.1 driver inclusion category: bugfix bugzilla: NA CVE: NA ---------------------------- Signed-off-by: Yonglong Liu Reviewed-by: li yongxin Signed-off-by: Laibin Qiu --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index aebf0d272251..f8131e4e343a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -32,7 +32,7 @@ #include #include -#define HNAE3_MOD_VERSION "21.12.4" +#define HNAE3_MOD_VERSION "22.2.1" #define HNAE3_MIN_VECTOR_NUM 2 /* first one for misc, another for IO */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h index f3a905252deb..a52b964d953c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h @@ -4,7 +4,7 @@ #ifndef __HNS3_CAE_VERSION_H__ #define __HNS3_CAE_VERSION_H__ -#define HNS3_CAE_MOD_VERSION "21.12.4" +#define HNS3_CAE_MOD_VERSION "22.2.1" #define CMT_ID_LEN 8 #define RESV_LEN 3 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index da140a3d568d..df7b9d3434ad 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -8,7 +8,7 @@ #include "hnae3.h" -#define HNS3_MOD_VERSION "21.12.4" +#define HNS3_MOD_VERSION "22.2.1" extern char hns3_driver_version[]; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 81451ccb4140..027d9137f76d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -12,7 +12,7 @@ #include "hclge_cmd.h" #include "hnae3.h" -#define HCLGE_MOD_VERSION "21.12.4" +#define HCLGE_MOD_VERSION "22.2.1" #define HCLGE_DRIVER_NAME "hclge" #define HCLGE_MAX_PF_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 6880c4cb52f3..b2fd2a154625 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -10,7 +10,7 @@ #include "hclgevf_cmd.h" #include "hnae3.h" -#define HCLGEVF_MOD_VERSION "21.12.4" +#define HCLGEVF_MOD_VERSION "22.2.1" #define HCLGEVF_DRIVER_NAME "hclgevf" #define HCLGEVF_MAX_VLAN_ID 4095 -- Gitee From 69bb4bb982ad84140780b7cd5aa4498be6706e83 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 14 Mar 2022 09:57:23 +0800 Subject: [PATCH 014/340] net: skip virtio_net_hdr_set_proto if protocol already set stable 
inclusion from linux-4.19.223 commit 5fefa884a55bd5995e77f4fdd89f0400f42b39f7 -------------------------------- [ Upstream commit 1ed1d592113959f00cc552c3b9f47ca2d157768f ] virtio_net_hdr_set_proto infers skb->protocol from the virtio_net_hdr gso_type, to avoid packets getting dropped for lack of a proto type. Its protocol choice is a guess, especially in the case of UFO, where the single VIRTIO_NET_HDR_GSO_UDP label covers both UFOv4 and UFOv6. Skip this best effort if the field is already initialized. Whether explicitly from userspace, or implicitly based on an earlier call to dev_parse_header_protocol (which is more robust, but was introduced after this patch). Fixes: 9d2f67e43b73 ("net/packet: fix packet drop as of virtio gso") Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20211220145027.2784293-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- include/linux/virtio_net.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 3e2ee7739c11..a7c197299fc7 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -10,6 +10,9 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { + if (skb->protocol) + return 0; + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: -- Gitee From a97abb624a6719f9834e68f0920e832ce3963e9b Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Mon, 14 Mar 2022 09:57:24 +0800 Subject: [PATCH 015/340] ipmi: Fix UAF when uninstall ipmi_si and ipmi_msghandler module stable inclusion from linux-4.19.223 commit 925229d552724e1bba1abf01d3a0b1318539b012 -------------------------------- [ Upstream commit ffb76a86f8096a8206be03b14adda6092e18e275 ] Hi, When testing install and uninstall of ipmi_si.ko and ipmi_msghandler.ko, the system crashed. The log as follows: [ 141.087026] BUG: unable to handle kernel paging request at ffffffffc09b3a5a [ 141.087241] PGD 8fe4c0d067 P4D 8fe4c0d067 PUD 8fe4c0f067 PMD 103ad89067 PTE 0 [ 141.087464] Oops: 0010 [#1] SMP NOPTI [ 141.087580] CPU: 67 PID: 668 Comm: kworker/67:1 Kdump: loaded Not tainted 4.18.0.x86_64 #47 [ 141.088009] Workqueue: events 0xffffffffc09b3a40 [ 141.088009] RIP: 0010:0xffffffffc09b3a5a [ 141.088009] Code: Bad RIP value. [ 141.088009] RSP: 0018:ffffb9094e2c3e88 EFLAGS: 00010246 [ 141.088009] RAX: 0000000000000000 RBX: ffff9abfdb1f04a0 RCX: 0000000000000000 [ 141.088009] RDX: 0000000000000000 RSI: 0000000000000246 RDI: 0000000000000246 [ 141.088009] RBP: 0000000000000000 R08: ffff9abfffee3cb8 R09: 00000000000002e1 [ 141.088009] R10: ffffb9094cb73d90 R11: 00000000000f4240 R12: ffff9abfffee8700 [ 141.088009] R13: 0000000000000000 R14: ffff9abfdb1f04a0 R15: ffff9abfdb1f04a8 [ 141.088009] FS: 0000000000000000(0000) GS:ffff9abfffec0000(0000) knlGS:0000000000000000 [ 141.088009] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 141.088009] CR2: ffffffffc09b3a30 CR3: 0000008fe4c0a001 CR4: 00000000007606e0 [ 141.088009] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 141.088009] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 141.088009] PKRU: 55555554 [ 141.088009] Call Trace: [ 141.088009] ? process_one_work+0x195/0x390 [ 141.088009] ? worker_thread+0x30/0x390 [ 141.088009] ? process_one_work+0x390/0x390 [ 141.088009] ? kthread+0x10d/0x130 [ 141.088009] ? 
kthread_flush_work_fn+0x10/0x10 [ 141.088009] ? ret_from_fork+0x35/0x40] BUG: unable to handle kernel paging request at ffffffffc0b28a5a [ 200.223240] PGD 97fe00d067 P4D 97fe00d067 PUD 97fe00f067 PMD a580cbf067 PTE 0 [ 200.223464] Oops: 0010 [#1] SMP NOPTI [ 200.223579] CPU: 63 PID: 664 Comm: kworker/63:1 Kdump: loaded Not tainted 4.18.0.x86_64 #46 [ 200.224008] Workqueue: events 0xffffffffc0b28a40 [ 200.224008] RIP: 0010:0xffffffffc0b28a5a [ 200.224008] Code: Bad RIP value. [ 200.224008] RSP: 0018:ffffbf3c8e2a3e88 EFLAGS: 00010246 [ 200.224008] RAX: 0000000000000000 RBX: ffffa0799ad6bca0 RCX: 0000000000000000 [ 200.224008] RDX: 0000000000000000 RSI: 0000000000000246 RDI: 0000000000000246 [ 200.224008] RBP: 0000000000000000 R08: ffff9fe43fde3cb8 R09: 00000000000000d5 [ 200.224008] R10: ffffbf3c8cb53d90 R11: 00000000000f4240 R12: ffff9fe43fde8700 [ 200.224008] R13: 0000000000000000 R14: ffffa0799ad6bca0 R15: ffffa0799ad6bca8 [ 200.224008] FS: 0000000000000000(0000) GS:ffff9fe43fdc0000(0000) knlGS:0000000000000000 [ 200.224008] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 200.224008] CR2: ffffffffc0b28a30 CR3: 00000097fe00a002 CR4: 00000000007606e0 [ 200.224008] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 200.224008] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 200.224008] PKRU: 55555554 [ 200.224008] Call Trace: [ 200.224008] ? process_one_work+0x195/0x390 [ 200.224008] ? worker_thread+0x30/0x390 [ 200.224008] ? process_one_work+0x390/0x390 [ 200.224008] ? kthread+0x10d/0x130 [ 200.224008] ? kthread_flush_work_fn+0x10/0x10 [ 200.224008] ? ret_from_fork+0x35/0x40 [ 200.224008] kernel fault(0x1) notification starting on CPU 63 [ 200.224008] kernel fault(0x1) notification finished on CPU 63 [ 200.224008] CR2: ffffffffc0b28a5a [ 200.224008] ---[ end trace c82a412d93f57412 ]--- The reason is as follows: T1: rmmod ipmi_si. ->ipmi_unregister_smi() -> ipmi_bmc_unregister() -> __ipmi_bmc_unregister() -> kref_put(&bmc->usecount, cleanup_bmc_device); -> schedule_work(&bmc->remove_work); T2: rmmod ipmi_msghandler. ipmi_msghander module uninstalled, and the module space will be freed. T3: bmc->remove_work doing cleanup the bmc resource. -> cleanup_bmc_work() -> platform_device_unregister(&bmc->pdev); -> platform_device_del(pdev); -> device_del(&pdev->dev); -> kobject_uevent(&dev->kobj, KOBJ_REMOVE); -> kobject_uevent_env() -> dev_uevent() -> if (dev->type && dev->type->name) 'dev->type'(bmc_device_type) pointer space has freed when uninstall ipmi_msghander module, 'dev->type->name' cause the system crash. drivers/char/ipmi/ipmi_msghandler.c: 2820 static const struct device_type bmc_device_type = { 2821 .groups = bmc_dev_attr_groups, 2822 }; Steps to reproduce: Add a time delay in cleanup_bmc_work() function, and uninstall ipmi_si and ipmi_msghandler module. 2910 static void cleanup_bmc_work(struct work_struct *work) 2911 { 2912 struct bmc_device *bmc = container_of(work, struct bmc_device, 2913 remove_work); 2914 int id = bmc->pdev.id; /* Unregister overwrites id */ 2915 2916 msleep(3000); <--- 2917 platform_device_unregister(&bmc->pdev); 2918 ida_simple_remove(&ipmi_bmc_ida, id); 2919 } Use 'remove_work_wq' instead of 'system_wq' to solve this issues. 
Fixes: b2cfd8ab4add ("ipmi: Rework device id and guid handling to catch changing BMCs") Signed-off-by: Wu Bo Message-Id: <1640070034-56671-1-git-send-email-wubo40@huawei.com> Signed-off-by: Corey Minyard Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/char/ipmi/ipmi_msghandler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 272c34102875..19c33a86c1b8 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -2863,7 +2863,7 @@ cleanup_bmc_device(struct kref *ref) * with removing the device attributes while reading a device * attribute. */ - schedule_work(&bmc->remove_work); + queue_work(remove_work_wq, &bmc->remove_work); } /* -- Gitee From 4e06cf778e884ac021ec2ff1f42e98f4d893ac5d Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Mon, 14 Mar 2022 09:57:25 +0800 Subject: [PATCH 016/340] bonding: fix ad_actor_system option setting to default stable inclusion from linux-4.19.223 commit 4e4591490f8a930e11131514db4cce4201ffc8a5 -------------------------------- [ Upstream commit 1c15b05baea71a5ff98235783e3e4ad227760876 ] When 802.3ad bond mode is configured the ad_actor_system option is set to "00:00:00:00:00:00". But when trying to set the all-zeroes MAC as actors' system address it was failing with EINVAL. An all-zeroes ethernet address is valid, only multicast addresses are not valid values. Fixes: 171a42c38c6e ("bonding: add netlink support for sys prio, actor sys mac, and port key") Signed-off-by: Fernando Fernandez Mancera Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20211221111345.2462-1-ffmancera@riseup.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- Documentation/networking/bonding.txt | 11 ++++++----- drivers/net/bonding/bond_options.c | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index e3abfbd32f71..b020e6ce6dd4 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -191,11 +191,12 @@ ad_actor_sys_prio ad_actor_system In an AD system, this specifies the mac-address for the actor in - protocol packet exchanges (LACPDUs). The value cannot be NULL or - multicast. It is preferred to have the local-admin bit set for this - mac but driver does not enforce it. If the value is not given then - system defaults to using the masters' mac address as actors' system - address. + protocol packet exchanges (LACPDUs). The value cannot be a multicast + address. If the all-zeroes MAC is specified, bonding will internally + use the MAC of the bond itself. It is preferred to have the + local-admin bit set for this mac but driver does not enforce it. If + the value is not given then system defaults to using the masters' + mac address as actors' system address. This parameter has effect only in 802.3ad mode and is available through SysFs interface. 
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 7e07e213f4c2..1745cba04c18 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1458,7 +1458,7 @@ static int bond_option_ad_actor_system_set(struct bonding *bond, mac = (u8 *)&newval->value; } - if (!is_valid_ether_addr(mac)) + if (is_multicast_ether_addr(mac)) goto err; netdev_dbg(bond->dev, "Setting ad_actor_system to %pM\n", mac); -- Gitee From fe5ad016b1b642ab450f61a689b0e943139fafe6 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 14 Mar 2022 09:57:26 +0800 Subject: [PATCH 017/340] ipmi: bail out if init_srcu_struct fails stable inclusion from linux-4.19.223 commit db0d90490674c297a2e15a1ca3e40cd6975c30dc -------------------------------- commit 2b5160b12091285c5aca45980f100a9294af7b04 upstream. In case, init_srcu_struct fails (because of memory allocation failure), we might proceed with the driver initialization despite srcu_struct not being entirely initialized. Fixes: 913a89f009d9 ("ipmi: Don't initialize anything in the core until something uses it") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Corey Minyard Cc: stable@vger.kernel.org Message-Id: <20211217154410.1228673-1-cascardo@canonical.com> Signed-off-by: Corey Minyard Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/char/ipmi/ipmi_msghandler.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 19c33a86c1b8..9b66afeb1237 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -5083,7 +5083,9 @@ static int ipmi_init_msghandler(void) if (initialized) goto out; - init_srcu_struct(&ipmi_interfaces_srcu); + rv = init_srcu_struct(&ipmi_interfaces_srcu); + if (rv) + goto out; timer_setup(&ipmi_timer, ipmi_timeout, 0); mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); -- Gitee From 12db569e13e64b94db22e806c79e75dc6ae4dc85 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 14 Mar 2022 09:57:27 +0800 Subject: [PATCH 018/340] ipmi: fix initialization when workqueue allocation fails stable inclusion from linux-4.19.223 commit eb84855d3e8799b67cdbadc7a5c53997cbfc3580 -------------------------------- commit 75d70d76cb7b927cace2cb34265d68ebb3306b13 upstream. If the workqueue allocation fails, the driver is marked as not initialized, and timer and panic_notifier will be left registered. Instead of removing those when workqueue allocation fails, do the workqueue initialization before doing it, and cleanup srcu_struct if it fails. 
Fixes: 1d49eb91e86e ("ipmi: Move remove_work to dedicated workqueue") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Corey Minyard Cc: Ioanna Alifieraki Cc: stable@vger.kernel.org Message-Id: <20211217154410.1228673-2-cascardo@canonical.com> Signed-off-by: Corey Minyard Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/char/ipmi/ipmi_msghandler.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 9b66afeb1237..83eaab66d2f6 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -5087,20 +5087,23 @@ static int ipmi_init_msghandler(void) if (rv) goto out; - timer_setup(&ipmi_timer, ipmi_timeout, 0); - mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - remove_work_wq = create_singlethread_workqueue("ipmi-msghandler-remove-wq"); if (!remove_work_wq) { pr_err("unable to create ipmi-msghandler-remove-wq workqueue"); rv = -ENOMEM; - goto out; + goto out_wq; } + timer_setup(&ipmi_timer, ipmi_timeout, 0); + mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + initialized = true; +out_wq: + if (rv) + cleanup_srcu_struct(&ipmi_interfaces_srcu); out: mutex_unlock(&ipmi_interfaces_mutex); return rv; -- Gitee From 4d0a4dc8e07599df06cc2653e03132a30360c1df Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Mon, 14 Mar 2022 09:57:28 +0800 Subject: [PATCH 019/340] x86/pkey: Fix undefined behaviour with PKRU_WD_BIT stable inclusion from linux-4.19.223 commit 94cc1e8331973ee7a2b4336f7ba67f48c2877d0a -------------------------------- commit 57690554abe135fee81d6ac33cc94d75a7e224bb upstream. Both __pkru_allows_write() and arch_set_user_pkey_access() shift PKRU_WD_BIT (a signed constant) by up to 30 bits, hitting the sign bit. Use unsigned constants instead. Clearly pkey 15 has not been used in combination with UBSAN yet. Noticed by code inspection only. I can't actually provoke the compiler into generating incorrect logic as far as this shift is concerned. [ dhansen: add stable@ tag, plus minor changelog massaging, For anyone doing backports, these #defines were in arch/x86/include/asm/pgtable.h before 784a46618f6. 
] Fixes: 33a709b25a76 ("mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys") Signed-off-by: Andrew Cooper Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20211216000856.4480-1-andrew.cooper3@citrix.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- arch/x86/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe6f9e7f15bb..1d79485ae972 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1353,8 +1353,8 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 +#define PKRU_AD_BIT 0x1u +#define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 static inline bool __pkru_allows_read(u32 pkru, u16 pkey) -- Gitee From cf75bcdb67f6da74b4af8abf05d669b7e3e0fe18 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 14 Mar 2022 09:57:29 +0800 Subject: [PATCH 020/340] selinux: initialize proto variable in selinux_ip_postroute_compat() stable inclusion from linux-4.19.224 commit c34a0b5b3bf01062eb5f2a47870c5ab57256bdfc -------------------------------- commit 732bc2ff080c447f8524f40c970c481f5da6eed3 upstream. Clang static analysis reports this warning hooks.c:5765:6: warning: 4th function call argument is an uninitialized value if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto)) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ selinux_parse_skb() can return ok without setting proto. The later call to selinux_xfrm_postroute_last() does an early check of proto and can return ok if the garbage proto value matches. So initialize proto. Cc: stable@vger.kernel.org Fixes: eef9b41622f2 ("selinux: cleanup selinux_xfrm_sock_rcv_skb() and selinux_xfrm_postroute_last()") Signed-off-by: Tom Rix [PM: typo/spelling and checkpatch.pl description fixes] Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- security/selinux/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1b611c068669..25353b4975c9 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5799,7 +5799,7 @@ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; - u8 proto; + u8 proto = 0; if (sk == NULL) return NF_ACCEPT; -- Gitee From 69563ca8ecb18b9536fd8e3e5512599dc58a04bc Mon Sep 17 00:00:00 2001 From: Coco Li Date: Mon, 14 Mar 2022 09:57:30 +0800 Subject: [PATCH 021/340] udp: using datalen to cap ipv6 udp max gso segments stable inclusion from linux-4.19.224 commit bbd9c7120c31c1aff6dcffdcce122773d163df72 -------------------------------- [ Upstream commit 736ef37fd9a44f5966e25319d08ff7ea99ac79e8 ] The max number of UDP gso segments is intended to cap to UDP_MAX_SEGMENTS, this is checked in udp_send_skb(). skb->len contains network and transport header len here, we should use only data len instead. 
This is the ipv6 counterpart to the below referenced commit, which missed the ipv6 change Fixes: 158390e45612 ("udp: using datalen to cap max gso segments") Signed-off-by: Coco Li Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20211223222441.2975883-1-lixiaoyan@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 581763d6df12..3ed5fe0055af 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1069,7 +1069,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EINVAL; } - if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) { + if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } -- Gitee From a69ad5a8f7bf6178a78f337fb49f97e663fd90fa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Mar 2022 09:57:31 +0800 Subject: [PATCH 022/340] net: fix use-after-free in tw_timer_handler stable inclusion from linux-4.19.224 commit a8e1944b44f94f5c5f530e434c5eaee787254566 -------------------------------- commit e22e45fc9e41bf9fcc1e92cfb78eb92786728ef0 upstream. A real world panic issue was found as follow in Linux 5.4. BUG: unable to handle page fault for address: ffffde49a863de28 PGD 7e6fe62067 P4D 7e6fe62067 PUD 7e6fe63067 PMD f51e064067 PTE 0 RIP: 0010:tw_timer_handler+0x20/0x40 Call Trace: call_timer_fn+0x2b/0x120 run_timer_softirq+0x1ef/0x450 __do_softirq+0x10d/0x2b8 irq_exit+0xc7/0xd0 smp_apic_timer_interrupt+0x68/0x120 apic_timer_interrupt+0xf/0x20 This issue was also reported since 2017 in the thread [1], unfortunately, the issue was still can be reproduced after fixing DCCP. The ipv4_mib_exit_net is called before tcp_sk_exit_batch when a net namespace is destroyed since tcp_sk_ops is registered befrore ipv4_mib_ops, which means tcp_sk_ops is in the front of ipv4_mib_ops in the list of pernet_list. There will be a use-after-free on net->mib.net_statistics in tw_timer_handler after ipv4_mib_exit_net if there are some inflight time-wait timers. This bug is not introduced by commit f2bf415cfed7 ("mib: add net to NET_ADD_STATS_BH") since the net_statistics is a global variable instead of dynamic allocation and freeing. Actually, commit 61a7e26028b9 ("mib: put net statistics on struct net") introduces the bug since it put net statistics on struct net and free it when net namespace is destroyed. Moving init_ipv4_mibs() to the front of tcp_init() to fix this bug and replace pr_crit() with panic() since continuing is meaningless when init_ipv4_mibs() fails. [1] https://groups.google.com/g/syzkaller/c/p1tn-_Kc6l4/m/smuL_FMAAgAJ?pli=1 Fixes: 61a7e26028b9 ("mib: put net statistics on struct net") Signed-off-by: Muchun Song Cc: Cong Wang Cc: Fam Zheng Cc: Link: https://lore.kernel.org/r/20211228104145.9426-1-songmuchun@bytedance.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/af_inet.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 14e10214cf87..84ce4f86bc19 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1955,6 +1955,10 @@ static int __init inet_init(void) ip_init(); + /* Initialise per-cpu ipv4 mibs */ + if (init_ipv4_mibs()) + panic("%s: Cannot init ipv4 mibs\n", __func__); + /* Setup TCP slab cache for open requests. 
*/ tcp_init(); @@ -1985,12 +1989,6 @@ static int __init inet_init(void) if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); - /* - * Initialise per-cpu ipv4 mibs - */ - - if (init_ipv4_mibs()) - pr_crit("%s: Cannot init ipv4 mibs\n", __func__); ipv4_proc_init(); -- Gitee From 83dc9c3336ca51b73430e2d3c7fd4409cc4949bb Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 14 Mar 2022 09:57:32 +0800 Subject: [PATCH 023/340] tracing: Fix check for trace_percpu_buffer validity in get_trace_buf() stable inclusion from linux-4.19.225 commit 24bb91f9536f79a13f85cfaae59be41fc8d9380d -------------------------------- commit 823e670f7ed616d0ce993075c8afe0217885f79d upstream. With the new osnoise tracer, we are seeing the below splat: Kernel attempted to read user page (c7d880000) - exploit attempt? (uid: 0) BUG: Unable to handle kernel data access on read at 0xc7d880000 Faulting instruction address: 0xc0000000002ffa10 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries ... NIP [c0000000002ffa10] __trace_array_vprintk.part.0+0x70/0x2f0 LR [c0000000002ff9fc] __trace_array_vprintk.part.0+0x5c/0x2f0 Call Trace: [c0000008bdd73b80] [c0000000001c49cc] put_prev_task_fair+0x3c/0x60 (unreliable) [c0000008bdd73be0] [c000000000301430] trace_array_printk_buf+0x70/0x90 [c0000008bdd73c00] [c0000000003178b0] trace_sched_switch_callback+0x250/0x290 [c0000008bdd73c90] [c000000000e70d60] __schedule+0x410/0x710 [c0000008bdd73d40] [c000000000e710c0] schedule+0x60/0x130 [c0000008bdd73d70] [c000000000030614] interrupt_exit_user_prepare_main+0x264/0x270 [c0000008bdd73de0] [c000000000030a70] syscall_exit_prepare+0x150/0x180 [c0000008bdd73e10] [c00000000000c174] system_call_vectored_common+0xf4/0x278 osnoise tracer on ppc64le is triggering osnoise_taint() for negative duration in get_int_safe_duration() called from trace_sched_switch_callback()->thread_exit(). The problem though is that the check for a valid trace_percpu_buffer is incorrect in get_trace_buf(). The check is being done after calculating the pointer for the current cpu, rather than on the main percpu pointer. Fix the check to be against trace_percpu_buffer. Link: https://lkml.kernel.org/r/a920e4272e0b0635cf20c444707cbce1b2c8973d.1640255304.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: e2ace001176dc9 ("tracing: Choose static tp_printk buffer by explicit nesting count") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f20a658dabb..5bd5eff5e0e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2796,7 +2796,7 @@ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!buffer || buffer->nesting >= 4) + if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; -- Gitee From 22b3d9deb55b4479c4fa029b9bf29f28faba3d68 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 14 Mar 2022 09:57:33 +0800 Subject: [PATCH 024/340] tracing: Tag trace_percpu_buffer as a percpu pointer stable inclusion from linux-4.19.225 commit e9c3f28e2b9eaff371db65e4bd7ee170616147b4 -------------------------------- commit f28439db470cca8b6b082239314e9fd10bd39034 upstream. 
Tag trace_percpu_buffer as a percpu pointer to resolve warnings reported by sparse: /linux/kernel/trace/trace.c:3218:46: warning: incorrect type in initializer (different address spaces) /linux/kernel/trace/trace.c:3218:46: expected void const [noderef] __percpu *__vpp_verify /linux/kernel/trace/trace.c:3218:46: got struct trace_buffer_struct * /linux/kernel/trace/trace.c:3234:9: warning: incorrect type in initializer (different address spaces) /linux/kernel/trace/trace.c:3234:9: expected void const [noderef] __percpu *__vpp_verify /linux/kernel/trace/trace.c:3234:9: got int * Link: https://lkml.kernel.org/r/ebabd3f23101d89cb75671b68b6f819f5edc830b.1640255304.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Reported-by: kernel test robot Fixes: 07d777fe8c398 ("tracing: Add percpu buffers for trace_printk()") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5bd5eff5e0e0..119dd5fd5840 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2786,7 +2786,7 @@ struct trace_buffer_struct { char buffer[4][TRACE_BUF_SIZE]; }; -static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * Thise allows for lockless recording. If we're nested too deeply, then @@ -2815,7 +2815,7 @@ static void put_trace_buf(void) static int alloc_percpu_trace_buffer(void) { - struct trace_buffer_struct *buffers; + struct trace_buffer_struct __percpu *buffers; buffers = alloc_percpu(struct trace_buffer_struct); if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) -- Gitee From 731268506da2f4f872a1a7ac999f2d3efc3a684f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 14 Mar 2022 09:57:34 +0800 Subject: [PATCH 025/340] ipv6: Check attribute length for RTA_GATEWAY in multipath route stable inclusion from linux-4.19.225 commit 44c90b4e6bcbb10f4066b91e8da9c58d4609768c -------------------------------- commit 4619bcf91399f00a40885100fb61d594d8454033 upstream. Commit referenced in the Fixes tag used nla_memcpy for RTA_GATEWAY as does the current nla_get_in6_addr. nla_memcpy protects against accessing memory greater than what is in the attribute, but there is no check requiring the attribute to have an IPv6 address. Add it. Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)") Signed-off-by: David Ahern Cc: Nicolas Dichtel Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv6/route.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 01f8e62302fa..a680498b1178 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4424,6 +4424,19 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); } +static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + if (nla_len(nla) < sizeof(*gw)) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); + return -EINVAL; + } + + *gw = nla_get_in6_addr(nla); + + return 0; +} + static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -4464,7 +4477,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - r_cfg.fc_gateway = nla_get_in6_addr(nla); + int ret; + + ret = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, + extack); + if (ret) + return ret; + r_cfg.fc_flags |= RTF_GATEWAY; } r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); -- Gitee From 524a79cfbf597f0b92419d7ed05aa019e325d625 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 14 Mar 2022 09:57:35 +0800 Subject: [PATCH 026/340] ipv6: Check attribute length for RTA_GATEWAY when deleting multipath route stable inclusion from linux-4.19.225 commit e30dead00aaa5a575b10117dad8edc126efc1873 -------------------------------- commit 1ff15a710a862db1101b97810af14aedc835a86a upstream. Make sure RTA_GATEWAY for IPv6 multipath route has enough bytes to hold an IPv6 address. Fixes: 6b9ea5a64ed5 ("ipv6: fix multipath route replace error recovery") Signed-off-by: David Ahern Cc: Roopa Prabhu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a680498b1178..0db7a3ca668a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4617,7 +4617,11 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - nla_memcpy(&r_cfg.fc_gateway, nla, 16); + err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, + extack); + if (err) + return err; + r_cfg.fc_flags |= RTF_GATEWAY; } } -- Gitee From 84502b29bb87b0bcb6790222e19f48e72ecdd984 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 14 Mar 2022 09:57:36 +0800 Subject: [PATCH 027/340] ipv6: Continue processing multipath route even if gateway attribute is invalid stable inclusion from linux-4.19.225 commit e183929b7db0e84a48fdc009a7bcbf9137ad9061 -------------------------------- [ Upstream commit e30a845b0376eb51c9c94f56bbd53b2e08ba822f ] ip6_route_multipath_del loop continues processing the multipath attribute even if delete of a nexthop path fails. For consistency, do the same if the gateway attribute is invalid. 
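A condensed sketch of the intended control flow (simplified; parse_gateway() and del_one_nexthop() are stand-ins for the real helpers, not kernel symbols):

  while (rtnh_ok(rtnh, remaining)) {
          err = parse_gateway(rtnh);        /* may fail on a bad RTA_GATEWAY */
          if (err) {
                  last_err = err;           /* remember the failure ...      */
                  goto next_rtnh;           /* ... but keep walking the list */
          }
          err = del_one_nexthop(rtnh);
          if (err)
                  last_err = err;
  next_rtnh:
          rtnh = rtnh_next(rtnh, &remaining);
  }
  return last_err;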
Fixes: 1ff15a710a86 ("ipv6: Check attribute length for RTA_GATEWAY when deleting multipath route") Signed-off-by: David Ahern Acked-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20220103171911.94739-1-dsahern@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv6/route.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0db7a3ca668a..e6b021588eb0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4619,8 +4619,10 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, if (nla) { err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, extack); - if (err) - return err; + if (err) { + last_err = err; + goto next_rtnh; + } r_cfg.fc_flags |= RTF_GATEWAY; } @@ -4629,6 +4631,7 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, if (err) last_err = err; +next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } -- Gitee From 96a7f55261aa69fce2594cd193d3e72950898692 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 14 Mar 2022 09:57:37 +0800 Subject: [PATCH 028/340] ipv6: Do cleanup if attribute validation fails in multipath route stable inclusion from linux-4.19.225 commit dd74b4e027324219e51d9821c0db4368c9f8526a -------------------------------- [ Upstream commit 95bdba23b5b4aa75fe3e6c84335e638641c707bb ] As Nicolas noted, if gateway validation fails walking the multipath attribute the code should jump to the cleanup to free previously allocated memory. Fixes: 1ff15a710a86 ("ipv6: Check attribute length for RTA_GATEWAY when deleting multipath route") Signed-off-by: David Ahern Acked-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20220103170555.94638-1-dsahern@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv6/route.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e6b021588eb0..80dd55c436fa 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4477,12 +4477,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - int ret; - - ret = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, + err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, extack); - if (ret) - return ret; + if (err) + goto cleanup; r_cfg.fc_flags |= RTF_GATEWAY; } -- Gitee From 453f6d74ffada5e8b76149619375f5a29f729d7a Mon Sep 17 00:00:00 2001 From: Lixiaokeng Date: Mon, 14 Mar 2022 09:57:38 +0800 Subject: [PATCH 029/340] scsi: libiscsi: Fix UAF in iscsi_conn_get_param()/iscsi_conn_teardown() stable inclusion from linux-4.19.225 commit f5596fdc65ce280cb4debb507c0c5877d1a89936 -------------------------------- [ Upstream commit 1b8d0300a3e9f216ae4901bab886db7299899ec6 ] |- iscsi_if_destroy_conn |-dev_attr_show |-iscsi_conn_teardown |-spin_lock_bh |-iscsi_sw_tcp_conn_get_param |-kfree(conn->persistent_address) |-iscsi_conn_get_param |-kfree(conn->local_ipaddr) ==>|-read persistent_address ==>|-read local_ipaddr |-spin_unlock_bh When iscsi_conn_teardown() and iscsi_conn_get_param() happen in parallel, a UAF may be triggered. Link: https://lore.kernel.org/r/046ec8a0-ce95-d3fc-3235-666a7c65b224@huawei.com Reported-by: Lu Tixiong Reviewed-by: Mike Christie Reviewed-by: Lee Duncan Signed-off-by: Lixiaokeng Signed-off-by: Linfeilong Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/scsi/libiscsi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 02060aec08bf..a93a47d4931f 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -3133,6 +3133,8 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) { struct iscsi_conn *conn = cls_conn->dd_data; struct iscsi_session *session = conn->session; + char *tmp_persistent_address = conn->persistent_address; + char *tmp_local_ipaddr = conn->local_ipaddr; del_timer_sync(&conn->transport_timer); @@ -3156,8 +3158,6 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) spin_lock_bh(&session->frwd_lock); free_pages((unsigned long) conn->data, get_order(ISCSI_DEF_MAX_RECV_SEG_LEN)); - kfree(conn->persistent_address); - kfree(conn->local_ipaddr); /* regular RX path uses back_lock */ spin_lock_bh(&session->back_lock); kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task, @@ -3169,6 +3169,8 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) mutex_unlock(&session->eh_mutex); iscsi_destroy_conn(cls_conn); + kfree(tmp_persistent_address); + kfree(tmp_local_ipaddr); } EXPORT_SYMBOL_GPL(iscsi_conn_teardown); -- Gitee From fbdd31a66fd4702dbccdf3a0b6458246b7b1c5c5 Mon Sep 17 00:00:00 2001 From: William Zhao Date: Mon, 14 Mar 2022 09:57:39 +0800 Subject: [PATCH 030/340] ip6_vti: initialize __ip6_tnl_parm struct in vti6_siocdevprivate stable inclusion from linux-4.19.225 commit 6f1ce654347ad16977afcb315fd9a9114ed9739d -------------------------------- [ Upstream commit c1833c3964d5bd8c163bd4e01736a38bc473cb8a ] The "__ip6_tnl_parm" struct was left uninitialized causing an invalid load of random data when the "__ip6_tnl_parm" struct was used elsewhere. As an example, in the function "ip6_tnl_xmit_ctl()", it tries to access the "collect_md" member. With "__ip6_tnl_parm" being uninitialized and containing random data, the UBSAN detected that "collect_md" held a non-boolean value. The UBSAN issue is as follows: =============================================================== UBSAN: invalid-load in net/ipv6/ip6_tunnel.c:1025:14 load of value 30 is not a valid value for type '_Bool' CPU: 1 PID: 228 Comm: kworker/1:3 Not tainted 5.16.0-rc4+ #8 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work Call Trace: dump_stack_lvl+0x44/0x57 ubsan_epilogue+0x5/0x40 __ubsan_handle_load_invalid_value+0x66/0x70 ? __cpuhp_setup_state+0x1d3/0x210 ip6_tnl_xmit_ctl.cold.52+0x2c/0x6f [ip6_tunnel] vti6_tnl_xmit+0x79c/0x1e96 [ip6_vti] ? lock_is_held_type+0xd9/0x130 ? vti6_rcv+0x100/0x100 [ip6_vti] ? lock_is_held_type+0xd9/0x130 ? rcu_read_lock_bh_held+0xc0/0xc0 ? lock_acquired+0x262/0xb10 dev_hard_start_xmit+0x1e6/0x820 __dev_queue_xmit+0x2079/0x3340 ? mark_lock.part.52+0xf7/0x1050 ? netdev_core_pick_tx+0x290/0x290 ? kvm_clock_read+0x14/0x30 ? kvm_sched_clock_read+0x5/0x10 ? sched_clock_cpu+0x15/0x200 ? find_held_lock+0x3a/0x1c0 ? lock_release+0x42f/0xc90 ? lock_downgrade+0x6b0/0x6b0 ? mark_held_locks+0xb7/0x120 ? neigh_connected_output+0x31f/0x470 ? lockdep_hardirqs_on+0x79/0x100 ? neigh_connected_output+0x31f/0x470 ? ip6_finish_output2+0x9b0/0x1d90 ? rcu_read_lock_bh_held+0x62/0xc0 ? ip6_finish_output2+0x9b0/0x1d90 ip6_finish_output2+0x9b0/0x1d90 ? ip6_append_data+0x330/0x330 ? ip6_mtu+0x166/0x370 ? __ip6_finish_output+0x1ad/0xfb0 ? nf_hook_slow+0xa6/0x170 ip6_output+0x1fb/0x710 ? 
nf_hook.constprop.32+0x317/0x430 ? ip6_finish_output+0x180/0x180 ? __ip6_finish_output+0xfb0/0xfb0 ? lock_is_held_type+0xd9/0x130 ndisc_send_skb+0xb33/0x1590 ? __sk_mem_raise_allocated+0x11cf/0x1560 ? dst_output+0x4a0/0x4a0 ? ndisc_send_rs+0x432/0x610 addrconf_dad_completed+0x30c/0xbb0 ? addrconf_rs_timer+0x650/0x650 ? addrconf_dad_work+0x73c/0x10e0 addrconf_dad_work+0x73c/0x10e0 ? addrconf_dad_completed+0xbb0/0xbb0 ? rcu_read_lock_sched_held+0xaf/0xe0 ? rcu_read_lock_bh_held+0xc0/0xc0 process_one_work+0x97b/0x1740 ? pwq_dec_nr_in_flight+0x270/0x270 worker_thread+0x87/0xbf0 ? process_one_work+0x1740/0x1740 kthread+0x3ac/0x490 ? set_kthread_struct+0x100/0x100 ret_from_fork+0x22/0x30 =============================================================== The solution is to initialize "__ip6_tnl_parm" struct to zeros in the "vti6_siocdevprivate()" function. Signed-off-by: William Zhao Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv6/ip6_vti.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 290badfe70e0..866ce815625e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -799,6 +799,8 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); + memset(&p1, 0, sizeof(p1)); + switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { -- Gitee From 31cad6a126132fc5266c830e25db495149d7c429 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Mar 2022 09:57:40 +0800 Subject: [PATCH 031/340] can: bcm: switch timer to HRTIMER_MODE_SOFT and remove hrtimer_tasklet stable inclusion from linux-4.19.226 commit 79305a826f872fe446c6fbf8450f515053ef6951 -------------------------------- commit bf74aa86e111aa3b2fbb25db37e3a3fab71b5b68 upstream. This patch switches the timer to HRTIMER_MODE_SOFT, which executed the timer callback in softirq context and removes the hrtimer_tasklet. 
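In outline, the conversion follows this pattern (timer setup only; locking and the throttle timer are handled the same way in the diff below):

  /* before: the hard-irq hrtimer callback could only schedule a tasklet */
  hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  op->timer.function = bcm_tx_timeout_handler;   /* -> tasklet_schedule() */

  /* after: a _SOFT timer already runs its callback in softirq context,
   * so the handler does the work itself and re-arms by returning
   * HRTIMER_RESTART instead of deferring to a tasklet */
  hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
  op->timer.function = bcm_tx_timeout_handler;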
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Ziyang Xuan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/can/bcm.c | 156 +++++++++++++++++--------------------------------- 1 file changed, 52 insertions(+), 104 deletions(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index e66377e764ba..ffc34e911808 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -105,7 +105,6 @@ struct bcm_op { unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; - struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; @@ -370,25 +369,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, } } -static void bcm_tx_start_timer(struct bcm_op *op) +static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) { + ktime_t ival; + if (op->kt_ival1 && op->count) - hrtimer_start(&op->timer, - ktime_add(ktime_get(), op->kt_ival1), - HRTIMER_MODE_ABS); + ival = op->kt_ival1; else if (op->kt_ival2) - hrtimer_start(&op->timer, - ktime_add(ktime_get(), op->kt_ival2), - HRTIMER_MODE_ABS); + ival = op->kt_ival2; + else + return false; + + hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); + return true; } -static void bcm_tx_timeout_tsklet(unsigned long data) +static void bcm_tx_start_timer(struct bcm_op *op) { - struct bcm_op *op = (struct bcm_op *)data; + if (bcm_tx_set_expiry(op, &op->timer)) + hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); +} + +/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) +{ + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { - op->count--; if (!op->count && (op->flags & TX_COUNTEVT)) { @@ -406,22 +414,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data) } bcm_can_tx(op); - } else if (op->kt_ival2) + } else if (op->kt_ival2) { bcm_can_tx(op); + } - bcm_tx_start_timer(op); -} - -/* - * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions - */ -static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) -{ - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); - - tasklet_schedule(&op->tsklet); - - return HRTIMER_NORESTART; + return bcm_tx_set_expiry(op, &op->timer) ? 
+ HRTIMER_RESTART : HRTIMER_NORESTART; } /* @@ -488,7 +486,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op, /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_SOFT); return; } @@ -547,14 +545,21 @@ static void bcm_rx_starttimer(struct bcm_op *op) return; if (op->kt_ival1) - hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); + hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } -static void bcm_rx_timeout_tsklet(unsigned long data) +/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { - struct bcm_op *op = (struct bcm_op *)data; + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; + /* if user wants to be informed, when cyclic CAN-Messages come back */ + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { + /* clear received CAN frames to indicate 'nothing received' */ + memset(op->last_frames, 0, op->nframes * op->cfsiz); + } + /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; @@ -566,25 +571,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data) msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); -} - -/* - * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out - */ -static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) -{ - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); - - /* schedule before NET_RX_SOFTIRQ */ - tasklet_hi_schedule(&op->tsklet); - - /* no restart of the timer is done here! */ - - /* if user wants to be informed, when cyclic CAN-Messages come back */ - if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { - /* clear received CAN frames to indicate 'nothing received' */ - memset(op->last_frames, 0, op->nframes * op->cfsiz); - } return HRTIMER_NORESTART; } @@ -592,14 +578,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ -static inline int bcm_rx_do_flush(struct bcm_op *op, int update, - unsigned int index) +static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { - if (update) - bcm_rx_changed(op, lcf); + bcm_rx_changed(op, lcf); return 1; } return 0; @@ -607,11 +591,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update, /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace - * - * update == 0 : just check if throttled data is available (any irq context) - * update == 1 : check and send throttled data to userspace (soft_irq context) */ -static int bcm_rx_thr_flush(struct bcm_op *op, int update) +static int bcm_rx_thr_flush(struct bcm_op *op) { int updated = 0; @@ -620,24 +601,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update) /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) - updated += bcm_rx_do_flush(op, update, i); + updated += bcm_rx_do_flush(op, i); } else { /* for RX_FILTER_ID and simple filter */ - updated += bcm_rx_do_flush(op, update, 0); + updated += bcm_rx_do_flush(op, 0); } return updated; } -static void bcm_rx_thr_tsklet(unsigned long data) -{ - struct bcm_op *op = (struct bcm_op *)data; - - /* push the changed data to the userspace */ 
- bcm_rx_thr_flush(op, 1); -} - /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace @@ -646,9 +619,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); - tasklet_schedule(&op->thrtsklet); - - if (bcm_rx_thr_flush(op, 0)) { + if (bcm_rx_thr_flush(op)) { hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); return HRTIMER_RESTART; } else { @@ -744,23 +715,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, static void bcm_remove_op(struct bcm_op *op) { - if (op->tsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || - hrtimer_active(&op->timer)) { - hrtimer_cancel(&op->timer); - tasklet_kill(&op->tsklet); - } - } - - if (op->thrtsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || - hrtimer_active(&op->thrtimer)) { - hrtimer_cancel(&op->thrtimer); - tasklet_kill(&op->thrtsklet); - } - } + hrtimer_cancel(&op->timer); + hrtimer_cancel(&op->thrtimer); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); @@ -994,15 +950,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_tx_timeout_handler; - /* initialize tasklet for tx countevent notification */ - tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet, - (unsigned long) op); - /* currently unused in tx_ops */ - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); @@ -1171,20 +1125,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_rx_timeout_handler; - /* initialize tasklet for rx timeout notification */ - tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet, - (unsigned long) op); - - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->thrtimer.function = bcm_rx_thr_handler; - /* initialize tasklet for rx throttle handling */ - tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet, - (unsigned long) op); - /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); @@ -1230,12 +1178,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); - bcm_rx_thr_flush(op, 1); + bcm_rx_thr_flush(op); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_SOFT); } /* now we can register for can_ids, if we added a new bcm_op */ -- Gitee From 6d8a0b1475a1327444198df5626f58dd650c414c Mon Sep 17 00:00:00 2001 From: Gang Li Date: Mon, 14 Mar 2022 09:57:41 +0800 Subject: [PATCH 032/340] shmem: fix a race between shmem_unused_huge_shrink and shmem_evict_inode stable inclusion from linux-4.19.226 
commit 9a3354fc78b7d7a79026053272bb0b74900c28c1 -------------------------------- commit 62c9827cbb996c2c04f615ecd783ce28bcea894b upstream. Fix a data race in commit 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure"). Here are call traces causing race: Call Trace 1: shmem_unused_huge_shrink+0x3ae/0x410 ? __list_lru_walk_one.isra.5+0x33/0x160 super_cache_scan+0x17c/0x190 shrink_slab.part.55+0x1ef/0x3f0 shrink_node+0x10e/0x330 kswapd+0x380/0x740 kthread+0xfc/0x130 ? mem_cgroup_shrink_node+0x170/0x170 ? kthread_create_on_node+0x70/0x70 ret_from_fork+0x1f/0x30 Call Trace 2: shmem_evict_inode+0xd8/0x190 evict+0xbe/0x1c0 do_unlinkat+0x137/0x330 do_syscall_64+0x76/0x120 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 A simple explanation: Image there are 3 items in the local list (@list). In the first traversal, A is not deleted from @list. 1) A->B->C ^ | pos (leave) In the second traversal, B is deleted from @list. Concurrently, A is deleted from @list through shmem_evict_inode() since last reference counter of inode is dropped by other thread. Then the @list is corrupted. 2) A->B->C ^ ^ | | evict pos (drop) We should make sure the inode is either on the global list or deleted from any local list before iput(). Fixed by moving inodes back to global list before we put them. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20211125064502.99983-1-ligang.bdlg@bytedance.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Gang Li Reviewed-by: Muchun Song Acked-by: Kirill A. Shutemov Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- mm/shmem.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index bc62dc732781..4363dbc8d57e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -541,7 +541,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -556,7 +556,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; } @@ -564,12 +563,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; } list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@ -589,7 +588,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, inode = &info->vfs_inode; if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back; page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@ -603,38 +602,44 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } /* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. 
*/ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; } ret = split_huge_page(page); unlock_page(page); put_page(page); - /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back; split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); } - spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; } -- Gitee From 3e6f1f0c864b6ce739cfe25ac168f789a36633e9 Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Mon, 14 Mar 2022 09:57:42 +0800 Subject: [PATCH 033/340] crypto: qce - fix uaf on qce_ahash_register_one stable inclusion from linux-4.19.226 commit 1aa6bac2dc1bce5f6afbb095c0ace374cc653947 -------------------------------- [ Upstream commit b4cb4d31631912842eb7dce02b4350cbb7562d5e ] Pointer base points to sub field of tmpl, it is dereferenced after tmpl is freed. Fix this by accessing base before free tmpl. Fixes: ec8f5d8f ("crypto: qce - Qualcomm crypto engine driver") Signed-off-by: Chengfeng Ye Acked-by: Thara Gopinath Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/crypto/qce/sha.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c index d8a5db11b7ea..bffd4d15145d 100644 --- a/drivers/crypto/qce/sha.c +++ b/drivers/crypto/qce/sha.c @@ -521,8 +521,8 @@ static int qce_ahash_register_one(const struct qce_ahash_def *def, ret = crypto_register_ahash(alg); if (ret) { - kfree(tmpl); dev_err(qce->dev, "%s registration failed\n", base->cra_name); + kfree(tmpl); return ret; } -- Gitee From 9033611692c77c7b38bbfd5fa098850e3025b6f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Mar 2022 09:57:43 +0800 Subject: [PATCH 034/340] netfilter: bridge: add support for pppoe filtering stable inclusion from linux-4.19.226 commit 519f563eca1b89965dd406a8f9807ca76562d202 -------------------------------- [ Upstream commit 28b78ecffea8078d81466b2e01bb5a154509f1ba ] This makes 'bridge-nf-filter-pppoe-tagged' sysctl work for bridged traffic. Looking at the original commit it doesn't appear this ever worked: static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, [..] if (skb->protocol == htons(ETH_P_8021Q)) { skb_pull(skb, VLAN_HLEN); skb->network_header += VLAN_HLEN; + } else if (skb->protocol == htons(ETH_P_PPP_SES)) { + skb_pull(skb, PPPOE_SES_HLEN); + skb->network_header += PPPOE_SES_HLEN; } [..] NF_HOOK(... POST_ROUTING, ...) ... but the adjusted offsets are never restored. The alternative would be to rip this code out for good, but otoh we'd have to keep this anyway for the vlan handling (which works because vlan tag info is in the skb, not the packet payload). 
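Conceptually the fix restores the pull/push symmetry around the bridged IP hooks (simplified sketch, not the exact call sites):

  if (skb->protocol == htons(ETH_P_PPP_SES))
          skb_pull(skb, PPPOE_SES_HLEN);     /* expose the inner IP header */

  /* ... bridged IPv4/IPv6 netfilter hooks run on the inner packet ... */

  nf_bridge_update_protocol(skb);            /* restore skb->protocol      */
  nf_bridge_push_encap_header(skb);          /* put the PPPoE/VLAN header
                                                back before MTU check/xmit */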
Reported-and-tested-by: Amish Chana Fixes: 516299d2f5b6f97 ("[NETFILTER]: bridge-nf: filter bridged IPv4/IPv6 encapsulated in pppoe traffic") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/bridge/br_netfilter_hooks.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index a403179eea49..fd5e3470560a 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -727,6 +727,9 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); @@ -744,8 +747,6 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; - nf_bridge_update_protocol(skb); - data = this_cpu_ptr(&brnf_frag_data_storage); data->vlan_tci = skb->vlan_tci; @@ -768,8 +769,6 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; - nf_bridge_update_protocol(skb); - data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; -- Gitee From 00128133d5b8c69d9f09e7f3819b7e3c949e2fb4 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Mon, 14 Mar 2022 09:57:44 +0800 Subject: [PATCH 035/340] tty: serial: uartlite: allow 64 bit address stable inclusion from linux-4.19.226 commit 628c2372100a118f5500f596c5c941720adbcc28 -------------------------------- [ Upstream commit 3672fb65155530b5eea6225685c75329b6debec3 ] The base address of uartlite registers could be 64 bit address which is from device resource. When ulite_probe() calls ulite_assign(), this 64 bit address is casted to 32-bit. The fix is to replace "u32" type with "phys_addr_t" type for the base address in ulite_assign() argument list. 
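A standalone illustration (plain C, made-up address) of the truncation that the phys_addr_t parameter avoids:

  #include <stdio.h>
  #include <stdint.h>

  static void assign_u32(uint32_t base)  { printf("u32 base : %#lx\n", (unsigned long)base); }
  static void assign_phys(uint64_t base) { printf("full base: %#llx\n", (unsigned long long)base); }

  int main(void)
  {
          uint64_t res_start = 0x400001000ULL;   /* device resource above 4 GiB */

          assign_u32((uint32_t)res_start);       /* prints 0x1000 - upper bits lost */
          assign_phys(res_start);                /* prints 0x400001000              */
          return 0;
  }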
Fixes: 8fa7b6100693 ("[POWERPC] Uartlite: Separate the bus binding from the driver proper") Signed-off-by: Lizhi Hou Link: https://lore.kernel.org/r/20211129202302.1319033-1-lizhi.hou@xilinx.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/serial/uartlite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/uartlite.c b/drivers/tty/serial/uartlite.c index 8df305822668..5d1b7455e627 100644 --- a/drivers/tty/serial/uartlite.c +++ b/drivers/tty/serial/uartlite.c @@ -618,7 +618,7 @@ static struct uart_driver ulite_uart_driver = { * * Returns: 0 on success, <0 otherwise */ -static int ulite_assign(struct device *dev, int id, u32 base, int irq, +static int ulite_assign(struct device *dev, int id, phys_addr_t base, int irq, struct uartlite_data *pdata) { struct uart_port *port; -- Gitee From ba8b1d7d1ec67cb3f27316bfcfb9a63e531be6f7 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 14 Mar 2022 09:57:45 +0800 Subject: [PATCH 036/340] serial: amba-pl011: do not request memory region twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from linux-4.19.226 commit 0d9a1f0245ed6cadb37fabdf204b67f230160ca0 -------------------------------- [ Upstream commit d1180405c7b5c7a1c6bde79d5fc24fe931430737 ] With commit 3873e2d7f63a ("drivers: PL011: refactor pl011_probe()") the function devm_ioremap() called from pl011_setup_port() was replaced with devm_ioremap_resource(). Since this function not only remaps but also requests the ports io memory region it now collides with the .config_port() callback which requests the same region at uart port registration. Since devm_ioremap_resource() already claims the memory successfully, the request in .config_port() fails. Later at uart port deregistration the attempt to release the unclaimed memory also fails. The failure results in a “Trying to free nonexistent resource" warning. Fix these issues by removing the callbacks that implement the redundant memory allocation/release. Also make sure that changing the drivers io memory base address via TIOCSSERIAL is not allowed any more. Fixes: 3873e2d7f63a ("drivers: PL011: refactor pl011_probe()") Signed-off-by: Lino Sanfilippo Link: https://lore.kernel.org/r/20211129174238.8333-1-LinoSanfilippo@gmx.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/serial/amba-pl011.c | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 1d27e006826d..aae97acd183a 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2160,32 +2160,13 @@ static const char *pl011_type(struct uart_port *port) return uap->port.type == PORT_AMBA ? uap->type : NULL; } -/* - * Release the memory region(s) being used by 'port' - */ -static void pl011_release_port(struct uart_port *port) -{ - release_mem_region(port->mapbase, SZ_4K); -} - -/* - * Request the memory region(s) being used by 'port' - */ -static int pl011_request_port(struct uart_port *port) -{ - return request_mem_region(port->mapbase, SZ_4K, "uart-pl011") - != NULL ? 0 : -EBUSY; -} - /* * Configure/autoconfigure the port. 
*/ static void pl011_config_port(struct uart_port *port, int flags) { - if (flags & UART_CONFIG_TYPE) { + if (flags & UART_CONFIG_TYPE) port->type = PORT_AMBA; - pl011_request_port(port); - } } /* @@ -2200,6 +2181,8 @@ static int pl011_verify_port(struct uart_port *port, struct serial_struct *ser) ret = -EINVAL; if (ser->baud_base < 9600) ret = -EINVAL; + if (port->mapbase != (unsigned long) ser->iomem_base) + ret = -EINVAL; return ret; } @@ -2217,8 +2200,6 @@ static const struct uart_ops amba_pl011_pops = { .flush_buffer = pl011_dma_flush_buffer, .set_termios = pl011_set_termios, .type = pl011_type, - .release_port = pl011_release_port, - .request_port = pl011_request_port, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL @@ -2248,8 +2229,6 @@ static const struct uart_ops sbsa_uart_pops = { .shutdown = sbsa_uart_shutdown, .set_termios = sbsa_uart_set_termios, .type = pl011_type, - .release_port = pl011_release_port, - .request_port = pl011_request_port, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL -- Gitee From cc97ecdb978753debcc99677f337aa231851d2f9 Mon Sep 17 00:00:00 2001 From: Li Hua Date: Mon, 14 Mar 2022 09:57:46 +0800 Subject: [PATCH 037/340] sched/rt: Try to restart rt period timer when rt runtime exceeded stable inclusion from linux-4.19.226 commit e6bc7279b16517fab9ed3cdbd58ad7b08060c246 -------------------------------- [ Upstream commit 9b58e976b3b391c0cf02e038d53dd0478ed3013c ] When rt_runtime is modified from -1 to a valid control value, it may cause the task to be throttled all the time. Operations like the following will trigger the bug. E.g: 1. echo -1 > /proc/sys/kernel/sched_rt_runtime_us 2. Run a FIFO task named A that executes while(1) 3. echo 950000 > /proc/sys/kernel/sched_rt_runtime_us When rt_runtime is -1, The rt period timer will not be activated when task A enqueued. And then the task will be throttled after setting rt_runtime to 950,000. The task will always be throttled because the rt period timer is not activated. 
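For reference, step 2 of the reproducer can be a minimal FIFO spinner like the one below (userspace, needs root; the priority value is an arbitrary choice for the test):

  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          struct sched_param sp = { .sched_priority = 10 };

          if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
                  perror("sched_setscheduler");
                  return 1;
          }
          for (;;)
                  ;       /* task A: busy loop, never blocks */
  }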
Fixes: d0b27fa77854 ("sched: rt-group: synchonised bandwidth period") Reported-by: Hulk Robot Signed-off-by: Li Hua Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211203033618.11895-1-hucool.lihua@huawei.com Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/sched/rt.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5a9c27956792..301ba04d9130 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -50,11 +50,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.function = sched_rt_period_timer; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; @@ -72,6 +69,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + do_start_rt_bandwidth(rt_b); +} + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -979,13 +984,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2654,8 +2663,12 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + unsigned long flags; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } int sched_rt_handler(struct ctl_table *table, int write, -- Gitee From 406dab9bc0d0014f9019bba17d0beb8bff7bee27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Mar 2022 09:57:47 +0800 Subject: [PATCH 038/340] xfrm: fix a small bug in xfrm_sa_len() stable inclusion from linux-4.19.226 commit fcb8f503d607d18b9fc69613e38358aae9f9fa99 -------------------------------- [ Upstream commit 7770a39d7c63faec6c4f33666d49a8cb664d0482 ] copy_user_offload() will actually push a struct struct xfrm_user_offload, which is different than (struct xfrm_state *)->xso (struct xfrm_state_offload) Fixes: d77e38e612a01 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Eric Dumazet Cc: Steffen Klassert Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index f94abe1fdd58..87932f6ad9d7 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2813,7 +2813,7 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) - l += nla_total_size(sizeof(x->xso)); + l += 
nla_total_size(sizeof(struct xfrm_user_offload)); if (x->props.smark.v | x->props.smark.m) { l += nla_total_size(sizeof(x->props.smark.v)); l += nla_total_size(sizeof(x->props.smark.m)); -- Gitee From e25cedfd876426cdd35d7d46b63ed3a82ab6ac14 Mon Sep 17 00:00:00 2001 From: Nicolas Toromanoff Date: Mon, 14 Mar 2022 09:57:48 +0800 Subject: [PATCH 039/340] crypto: stm32/cryp - fix double pm exit stable inclusion from linux-4.19.226 commit a803ac39fb163677b4eaf5317894a3dd62912b56 -------------------------------- [ Upstream commit 6c12e742785bf9333faf60bfb96575bdd763448e ] Delete extraneous lines in probe error handling code: pm was disabled twice. Fixes: 65f9aa36ee47 ("crypto: stm32/cryp - Add power management support") Reported-by: Marek Vasut Signed-off-by: Nicolas Toromanoff Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/crypto/stm32/stm32-cryp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 23b0b7bd64c7..b3b49dce1136 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -2036,8 +2036,6 @@ static int stm32_cryp_probe(struct platform_device *pdev) list_del(&cryp->list); spin_unlock(&cryp_list.lock); - pm_runtime_disable(dev); - pm_runtime_put_noidle(dev); pm_runtime_disable(dev); pm_runtime_put_noidle(dev); -- Gitee From 68c0fe9e15296eab661ff7efc014c8c6e64dfa9d Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Mon, 14 Mar 2022 09:57:49 +0800 Subject: [PATCH 040/340] xfrm: interface with if_id 0 should return error stable inclusion from linux-4.19.226 commit f7594c07fb0270c5414460432d3344b8831789c2 -------------------------------- [ Upstream commit 8dce43919566f06e865f7e8949f5c10d8c2493f5 ] xfrm interface if_id = 0 would cause xfrm policy lookup errors since Commit 9f8550e4bd9d. Now explicitly fail to create an xfrm interface when if_id = 0 With this commit: ip link add ipsec0 type xfrm dev lo if_id 0 Error: if_id must be non zero. 
v1->v2 change: - add Fixes: tag Fixes: 9f8550e4bd9d ("xfrm: fix disable_xfrm sysctl when used on xfrm interfaces") Signed-off-by: Antony Antony Reviewed-by: Eyal Birger Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/xfrm/xfrm_interface.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 9dd43319b2c0..6bfd7b8249da 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -662,11 +662,16 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); - struct xfrm_if_parms p; + struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; @@ -691,7 +696,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; - struct xfrm_if_parms p; + struct xfrm_if_parms p = {}; + + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); -- Gitee From 38af239f108f6a602f813712a39798f4425f3a86 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Mon, 14 Mar 2022 09:57:50 +0800 Subject: [PATCH 041/340] xfrm: state and policy should fail if XFRMA_IF_ID 0 stable inclusion from linux-4.19.226 commit a6ea2625df628d4d18b86f6ee21835c8962c8f23 -------------------------------- [ Upstream commit 68ac0f3810e76a853b5f7b90601a05c3048b8b54 ] xfrm ineterface does not allow xfrm if_id = 0 fail to create or update xfrm state and policy. 
With this commit: ip xfrm policy add src 192.0.2.1 dst 192.0.2.2 dir out if_id 0 RTNETLINK answers: Invalid argument ip xfrm state add src 192.0.2.1 dst 192.0.2.2 proto esp spi 1 \ reqid 1 mode tunnel aead 'rfc4106(gcm(aes))' \ 0x1111111111111111111111111111111111111111 96 if_id 0 RTNETLINK answers: Invalid argument v1->v2 change: - add Fixes: tag Fixes: 9f8550e4bd9d ("xfrm: fix disable_xfrm sysctl when used on xfrm interfaces") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/xfrm/xfrm_user.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 87932f6ad9d7..8d8f9e778cd4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -620,8 +620,13 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); - if (attrs[XFRMA_IF_ID]) + if (attrs[XFRMA_IF_ID]) { x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (!x->if_id) { + err = -EINVAL; + goto error; + } + } err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -1327,8 +1332,13 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, mark = xfrm_mark_get(attrs, &m); - if (attrs[XFRMA_IF_ID]) + if (attrs[XFRMA_IF_ID]) { if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (!if_id) { + err = -EINVAL; + goto out_noput; + } + } if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); @@ -1630,8 +1640,13 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); - if (attrs[XFRMA_IF_ID]) + if (attrs[XFRMA_IF_ID]) { xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (!xp->if_id) { + err = -EINVAL; + goto error; + } + } return xp; error: -- Gitee From 0f8a88115ed26bd9225626e5ebae5f23bb9cea0a Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Mon, 14 Mar 2022 09:57:51 +0800 Subject: [PATCH 042/340] netfilter: ipt_CLUSTERIP: fix refcount leak in clusterip_tg_check() stable inclusion from linux-4.19.226 commit f0351a2178fa64f87e609c341f07bf4260b10daf -------------------------------- [ Upstream commit d94a69cb2cfa77294921aae9afcfb866e723a2da ] The issue takes place in one error path of clusterip_tg_check(). When memcmp() returns nonzero, the function simply returns the error code, forgetting to decrease the reference count of a clusterip_config object, which is bumped earlier by clusterip_config_find_get(). This may incur reference count leak. Fix this issue by decrementing the refcount of the object in specific error path. 
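The error path after the fix follows the usual rule that a lookup which takes references must be balanced on every exit (sketch of the affected branch only; the earlier lookup is elided):

  /* 'config' came from clusterip_config_find_get() earlier in the function */
  if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) {
          clusterip_config_entry_put(config);   /* undo what find_get() took */
          clusterip_config_put(config);
          return -EINVAL;
  }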
Fixes: 06aa151ad1fc74 ("netfilter: ipt_CLUSTERIP: check MAC address when duplicate config is set") Signed-off-by: Xin Xiong Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2c11050d679b..08297d3504cf 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -509,8 +509,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (IS_ERR(config)) return PTR_ERR(config); } - } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) + } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) { + clusterip_config_entry_put(config); + clusterip_config_put(config); return -EINVAL; + } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { -- Gitee From eb30ebe770044b4a4ae36dfc45c3f550553a0bd5 Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Mon, 14 Mar 2022 09:57:52 +0800 Subject: [PATCH 043/340] tpm: add request_locality before write TPM_INT_ENABLE stable inclusion from linux-4.19.226 commit f155b0fbde518857ee2d42f1fbfda6bdf0123363 -------------------------------- [ Upstream commit 0ef333f5ba7f24f5d8478425c163d3097f1c7afd ] Locality is not appropriately requested before writing the int mask. Add the missing boilerplate. Fixes: e6aef069b6e9 ("tpm_tis: convert to using locality callbacks") Signed-off-by: Chen Jun Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/char/tpm/tpm_tis_core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 0aade05f47fa..a8e8289e0b35 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -918,7 +918,15 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, intmask |= TPM_INTF_CMD_READY_INT | TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_DATA_AVAIL_INT | TPM_INTF_STS_VALID_INT; intmask &= ~TPM_GLOBAL_INT_ENABLE; + + rc = request_locality(chip, 0); + if (rc < 0) { + rc = -ENODEV; + goto out_err; + } + tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask); + release_locality(chip, 0); rc = tpm2_probe(chip); if (rc) -- Gitee From caed2b609b8cc54ecb52b00c8b8f73148cfd66e3 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Mon, 14 Mar 2022 09:57:53 +0800 Subject: [PATCH 044/340] net: mcs7830: handle usb read errors properly stable inclusion from linux-4.19.226 commit e0b2cd310d0859907a9a7b4e4ef9a5bcce9d3d6d -------------------------------- [ Upstream commit d668769eb9c52b150753f1653f7f5a0aeb8239d2 ] Syzbot reported uninit value in mcs7830_bind(). The problem was in missing validation check for bytes read via usbnet_read_cmd(). usbnet_read_cmd() internally calls usb_control_msg(), that returns number of bytes read. Code should validate that requested number of bytes was actually read. 
So, this patch adds missing size validation check inside mcs7830_get_reg() to prevent uninit value bugs Reported-and-tested-by: syzbot+003c0a286b9af5412510@syzkaller.appspotmail.com Fixes: 2a36d7083438 ("USB: driver for mcs7830 (aka DeLOCK) USB ethernet adapter") Signed-off-by: Pavel Skripkin Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20220106225716.7425-1-paskripkin@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/net/usb/mcs7830.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c index 5a47e5510ca8..c0f52a622964 100644 --- a/drivers/net/usb/mcs7830.c +++ b/drivers/net/usb/mcs7830.c @@ -121,8 +121,16 @@ static const char driver_name[] = "MOSCHIP usb-ethernet driver"; static int mcs7830_get_reg(struct usbnet *dev, u16 index, u16 size, void *data) { - return usbnet_read_cmd(dev, MCS7830_RD_BREQ, MCS7830_RD_BMREQ, - 0x0000, index, data, size); + int ret; + + ret = usbnet_read_cmd(dev, MCS7830_RD_BREQ, MCS7830_RD_BMREQ, + 0x0000, index, data, size); + if (ret < 0) + return ret; + else if (ret < size) + return -ENODATA; + + return ret; } static int mcs7830_set_reg(struct usbnet *dev, u16 index, u16 size, const void *data) -- Gitee From f19bc9786b7095592eb60ed4e425a35fee0f2b3f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Mar 2022 09:57:54 +0800 Subject: [PATCH 045/340] ext4: avoid trim error on fs with small groups stable inclusion from linux-4.19.226 commit 1105c2dac8cb45c297401e32884cbefc0f123301 -------------------------------- [ Upstream commit 173b6e383d2a204c9921ffc1eca3b87aa2106c33 ] A user reported FITRIM ioctl failing for him on ext4 on some devices without apparent reason. After some debugging we've found out that these devices (being LVM volumes) report rather large discard granularity of 42MB and the filesystem had 1k blocksize and thus group size of 8MB. Because ext4 FITRIM implementation puts discard granularity into minlen, ext4_trim_fs() declared the trim request as invalid. However just silently doing nothing seems to be a more appropriate reaction to such combination of parameters since user did not specify anything wrong. 
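Rough numbers from the report above (assuming the usual 8192 blocks per group at 1k blocksize):

  group size          : 8192 blocks x 1 KiB = 8 MiB
  discard_granularity : 42 MiB -> minlen of roughly 43008 blocks
  minlen > blocks per group, so no free extent in any group can qualify;
  the request now trims nothing and returns success instead of -EINVAL.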
CC: Lukas Czerner Fixes: 5c2ed62fd447 ("ext4: Adjust minlen with discard_granularity in the FITRIM ioctl") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20211112152202.26614-1-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/ext4/ioctl.c | 2 -- fs/ext4/mballoc.c | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c6cac2d40f71..3c8324715e65 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1056,8 +1056,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) sizeof(range))) return -EFAULT; - range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 69ed137d07a9..990fe7eed6bf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5262,6 +5262,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -5280,6 +5281,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; + /* No point to try to trim less than discard granularity */ + if (range->minlen < q->limits.discard_granularity) { + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + q->limits.discard_granularity >> sb->s_blocksize_bits); + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) + goto out; + } if (end >= max_blks) end = max_blks - 1; if (end <= first_data_blk) -- Gitee From 9ea992512930a42a43860b268679ff5d05909710 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Mon, 14 Mar 2022 09:57:55 +0800 Subject: [PATCH 046/340] iommu/io-pgtable-arm: Fix table descriptor paddr formatting stable inclusion from linux-4.19.226 commit 4208239d748b5a8f32d5c2323d6f385dcb7f9b9e -------------------------------- [ Upstream commit 9abe2ac834851a7d0b0756e295cf7a292c45ca53 ] Table descriptors were being installed without properly formatting the address using paddr_to_iopte, which does not match up with the iopte_deref in __arm_lpae_map. This is incorrect for the LPAE pte format, as it does not handle the high bits properly. This was found on Apple T6000 DARTs, which require a new pte format (different shift); adding support for that to paddr_to_iopte/iopte_to_paddr caused it to break badly, as even <48-bit addresses would end up incorrect in that case. 
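The essence of the fix (taken from the hunk below) is to format the table pointer with paddr_to_iopte() so that it matches what iopte_deref()/iopte_to_paddr() expect when the table is walked later:

    new = paddr_to_iopte(__pa(table), data) | ARM_LPAE_PTE_TYPE_TABLE;
    if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
        new |= ARM_LPAE_PTE_NSTABLE;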
Fixes: 6c89928ff7a0 ("iommu/io-pgtable-arm: Support 52-bit physical address") Acked-by: Robin Murphy Signed-off-by: Hector Martin Link: https://lore.kernel.org/r/20211120031343.88034-1-marcan@marcan.st Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/iommu/io-pgtable-arm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index efd6e994678f..3f99077b3611 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -299,11 +299,12 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table, arm_lpae_iopte *ptep, arm_lpae_iopte curr, - struct io_pgtable_cfg *cfg) + struct arm_lpae_io_pgtable *data) { arm_lpae_iopte old, new; + struct io_pgtable_cfg *cfg = &data->iop.cfg; - new = __pa(table) | ARM_LPAE_PTE_TYPE_TABLE; + new = paddr_to_iopte(__pa(table), data) | ARM_LPAE_PTE_TYPE_TABLE; if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) new |= ARM_LPAE_PTE_NSTABLE; @@ -354,7 +355,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, if (!cptep) return -ENOMEM; - pte = arm_lpae_install_table(cptep, ptep, 0, cfg); + pte = arm_lpae_install_table(cptep, ptep, 0, data); if (pte) __arm_lpae_free_pages(cptep, tblsz, cfg); } else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) { @@ -513,7 +514,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]); } - pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg); + pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); if (pte != blk_pte) { __arm_lpae_free_pages(tablep, tablesz, cfg); /* -- Gitee From 878f72d43745fa236696de929b8f84f614c74a1d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 14 Mar 2022 09:57:56 +0800 Subject: [PATCH 047/340] scsi: ufs: Fix race conditions related to driver data stable inclusion from linux-4.19.226 commit 5e7fb994ccc7200e20d7f394aa3f082f4efb49ab -------------------------------- [ Upstream commit 21ad0e49085deb22c094f91f9da57319a97188e4 ] The driver data pointer must be set before any callbacks are registered that use that pointer. Hence move the initialization of that pointer from after the ufshcd_init() call to inside ufshcd_init(). Link: https://lore.kernel.org/r/20211203231950.193369-7-bvanassche@acm.org Fixes: 3b1d05807a9a ("[SCSI] ufs: Segregate PCI Specific Code") Reported-by: Alexey Dobriyan Tested-by: Bean Huo Reviewed-by: Bean Huo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/scsi/ufs/tc-dwc-g210-pci.c | 1 - drivers/scsi/ufs/ufshcd-pltfrm.c | 2 -- drivers/scsi/ufs/ufshcd.c | 7 +++++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/ufs/tc-dwc-g210-pci.c b/drivers/scsi/ufs/tc-dwc-g210-pci.c index 2f41722a8c28..2c6cb7f6b61a 100644 --- a/drivers/scsi/ufs/tc-dwc-g210-pci.c +++ b/drivers/scsi/ufs/tc-dwc-g210-pci.c @@ -138,7 +138,6 @@ tc_dwc_g210_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return err; } - pci_set_drvdata(pdev, hba); pm_runtime_put_noidle(&pdev->dev); pm_runtime_allow(&pdev->dev); diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.c b/drivers/scsi/ufs/ufshcd-pltfrm.c index 30c22e16b1e3..57985841a879 100644 --- a/drivers/scsi/ufs/ufshcd-pltfrm.c +++ b/drivers/scsi/ufs/ufshcd-pltfrm.c @@ -348,8 +348,6 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, goto dealloc_host; } - platform_set_drvdata(pdev, hba); - pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 3d9bfa520768..2082907bea50 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8032,6 +8032,13 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) struct Scsi_Host *host = hba->host; struct device *dev = hba->dev; + /* + * dev_set_drvdata() must be called before any callbacks are registered + * that use dev_get_drvdata() (frequency scaling, clock scaling, hwmon, + * sysfs). + */ + dev_set_drvdata(dev, hba); + if (!mmio_base) { dev_err(hba->dev, "Invalid memory reference for mmio_base is NULL\n"); -- Gitee From 15ff35bb61a1c972f98eb8c74680be24de4c2c5d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 14 Mar 2022 09:57:57 +0800 Subject: [PATCH 048/340] dmaengine: pxa/mmp: stop referencing config->slave_id stable inclusion from linux-4.19.226 commit c976850fa06cca6675f24fdd5b44cb13ea9d7189 -------------------------------- [ Upstream commit 134c37fa250a87a7e77c80a7c59ae16c462e46e0 ] The last driver referencing the slave_id on Marvell PXA and MMP platforms was the SPI driver, but this stopped doing so a long time ago, so the TODO from the earlier patch can no be removed. Fixes: b729bf34535e ("spi/pxa2xx: Don't use slave_id of dma_slave_config") Fixes: 13b3006b8ebd ("dma: mmp_pdma: add filter function") Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Link: https://lore.kernel.org/r/20211122222203.4103644-7-arnd@kernel.org Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/dma/mmp_pdma.c | 6 ------ drivers/dma/pxa_dma.c | 7 ------- 2 files changed, 13 deletions(-) diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c index eb3a1f42ab06..e8b2d3e31de8 100644 --- a/drivers/dma/mmp_pdma.c +++ b/drivers/dma/mmp_pdma.c @@ -722,12 +722,6 @@ static int mmp_pdma_config(struct dma_chan *dchan, chan->dir = cfg->direction; chan->dev_addr = addr; - /* FIXME: drivers should be ported over to use the filter - * function. Once that's done, the following two lines can - * be removed. 
- */ - if (cfg->slave_id) - chan->drcmr = cfg->slave_id; return 0; } diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c index b31c28b67ad3..c54986902b9d 100644 --- a/drivers/dma/pxa_dma.c +++ b/drivers/dma/pxa_dma.c @@ -960,13 +960,6 @@ static void pxad_get_config(struct pxad_chan *chan, *dcmd |= PXA_DCMD_BURST16; else if (maxburst == 32) *dcmd |= PXA_DCMD_BURST32; - - /* FIXME: drivers should be ported over to use the filter - * function. Once that's done, the following two lines can - * be removed. - */ - if (chan->cfg.slave_id) - chan->drcmr = chan->cfg.slave_id; } static struct dma_async_tx_descriptor * -- Gitee From 4407f10ebb0399bada9ba4e1b94c712bb1ab4f2b Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 14 Mar 2022 09:57:58 +0800 Subject: [PATCH 049/340] net-sysfs: update the queue counts in the unregistration path stable inclusion from linux-4.19.226 commit 35cad2003b6447932cfe91f795090586306738e8 -------------------------------- [ Upstream commit d7dac083414eb5bb99a6d2ed53dc2c1b405224e5 ] When updating Rx and Tx queue kobjects, the queue count should always be updated to match the queue kobjects count. This was not done in the net device unregistration path, fix it. Tracking all queue count updates will allow in a following up patch to detect illegal updates. Signed-off-by: Antoine Tenart Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/core/net-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fe0d255d66c8..e5dc04cb5599 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1616,6 +1616,9 @@ static void remove_queue_kobjects(struct net_device *dev) net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); + + dev->real_num_rx_queues = 0; + dev->real_num_tx_queues = 0; #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif -- Gitee From 7ff83febc9ff2447df1bdf19eab137006239475a Mon Sep 17 00:00:00 2001 From: Suresh Kumar Date: Mon, 14 Mar 2022 09:57:59 +0800 Subject: [PATCH 050/340] net: bonding: debug: avoid printing debug logs when bond is not notifying peers stable inclusion from linux-4.19.226 commit 4cca06db20796f60ad83943b80664ef0d652dcac -------------------------------- [ Upstream commit fee32de284ac277ba434a2d59f8ce46528ff3946 ] Currently "bond_should_notify_peers: slave ..." messages are printed whenever "bond_should_notify_peers" function is called. +++ Dec 12 12:33:26 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:26 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:26 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:26 node1 kernel: bond0: (slave enp0s25): Received LACPDU on port 1 Dec 12 12:33:26 node1 kernel: bond0: (slave enp0s25): Rx Machine: Port=1, Last State=6, Curr State=6 Dec 12 12:33:26 node1 kernel: bond0: (slave enp0s25): partner sync=1 Dec 12 12:33:26 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:26 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:26 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 ... 
Dec 12 12:33:30 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:30 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:30 node1 kernel: bond0: (slave enp4s3): Received LACPDU on port 2 Dec 12 12:33:30 node1 kernel: bond0: (slave enp4s3): Rx Machine: Port=2, Last State=6, Curr State=6 Dec 12 12:33:30 node1 kernel: bond0: (slave enp4s3): partner sync=1 Dec 12 12:33:30 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:30 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 Dec 12 12:33:30 node1 kernel: bond0: bond_should_notify_peers: slave enp0s25 +++ This is confusing and can also clutter up debug logs. Print logs only when the peer notification happens. Signed-off-by: Suresh Kumar Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/net/bonding/bond_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c952ab169e4e..f4d40e983e99 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -782,9 +782,6 @@ static bool bond_should_notify_peers(struct bonding *bond) slave = rcu_dereference(bond->curr_active_slave); rcu_read_unlock(); - netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", - slave ? slave->dev->name : "NULL"); - if (!slave || !bond->send_peer_notif || bond->send_peer_notif % max(1, bond->params.peer_notif_delay) != 0 || @@ -792,6 +789,9 @@ static bool bond_should_notify_peers(struct bonding *bond) test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; + netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", + slave ? slave->dev->name : "NULL"); + return true; } -- Gitee From cd2a66cec92953ac6159dd1f8f66072eb7e44e85 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 14 Mar 2022 09:58:00 +0800 Subject: [PATCH 051/340] bpf: Do not WARN in bpf_warn_invalid_xdp_action() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from linux-4.19.226 commit f6d5eb174ee9b9ca4911b98f243f149fbfd19a43 -------------------------------- [ Upstream commit 2cbad989033bff0256675c38f96f5faab852af4b ] The WARN_ONCE() in bpf_warn_invalid_xdp_action() can be triggered by any bugged program, and even attaching a correct program to a NIC not supporting the given action. The resulting splat, beyond polluting the logs, fouls automated tools: e.g. a syzkaller reproducers using an XDP program returning an unsupported action will never pass validation. Replace the WARN_ONCE with a less intrusive pr_warn_once(). Signed-off-by: Paolo Abeni Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/016ceec56e4817ebb2a9e35ce794d5c917df572c.1638189075.git.pabeni@redhat.com Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 5f24a6b82802..0730395918e0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5528,9 +5528,9 @@ void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; - WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", - act > act_max ? "Illegal" : "Driver unsupported", - act); + pr_warn_once("%s XDP return value %u, expect packet loss!\n", + act > act_max ? 
"Illegal" : "Driver unsupported", + act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -- Gitee From 5ccb974b936e362905267829ae33d209283d974a Mon Sep 17 00:00:00 2001 From: Kyeong Yoo Date: Mon, 14 Mar 2022 09:58:01 +0800 Subject: [PATCH 052/340] jffs2: GC deadlock reading a page that is used in jffs2_write_begin() stable inclusion from linux-4.19.226 commit c0b59abaf8effbac022f6d8b526c4b0b620920b3 -------------------------------- [ Upstream commit aa39cc675799bc92da153af9a13d6f969c348e82 ] GC task can deadlock in read_cache_page() because it may attempt to release a page that is actually allocated by another task in jffs2_write_begin(). The reason is that in jffs2_write_begin() there is a small window a cache page is allocated for use but not set Uptodate yet. This ends up with a deadlock between two tasks: 1) A task (e.g. file copy) - jffs2_write_begin() locks a cache page - jffs2_write_end() tries to lock "alloc_sem" from jffs2_reserve_space() <-- STUCK 2) GC task (jffs2_gcd_mtd3) - jffs2_garbage_collect_pass() locks "alloc_sem" - try to lock the same cache page in read_cache_page() <-- STUCK So to avoid this deadlock, hold "alloc_sem" in jffs2_write_begin() while reading data in a cache page. Signed-off-by: Kyeong Yoo Signed-off-by: Richard Weinberger Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/jffs2/file.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 7d8654a1472e..3047872fdac9 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -135,20 +135,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); pgoff_t index = pos >> PAGE_SHIFT; uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; - pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) - return -ENOMEM; - *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; uint32_t alloc_len; @@ -159,7 +154,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) - goto out_page; + goto out_err; mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); @@ -189,7 +184,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = PTR_ERR(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); if (f->metadata) { @@ -204,13 +199,26 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } jffs2_complete_reservation(c); inode->i_size = pageofs; mutex_unlock(&f->sem); } + /* + * While getting a page and reading data in, lock c->alloc_sem until + * the page is Uptodate. Otherwise GC task may attempt to read the same + * page in read_cache_page(), which causes a deadlock. 
+ */ + mutex_lock(&c->alloc_sem); + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) { + ret = -ENOMEM; + goto release_sem; + } + *pagep = pg; + /* * Read in the page if it wasn't already present. Cannot optimize away * the whole page write case until jffs2_write_end can handle the @@ -220,15 +228,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); mutex_unlock(&f->sem); - if (ret) - goto out_page; + if (ret) { + unlock_page(pg); + put_page(pg); + goto release_sem; + } } jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); - return ret; -out_page: - unlock_page(pg); - put_page(pg); +release_sem: + mutex_unlock(&c->alloc_sem); +out_err: return ret; } -- Gitee From 547341181064479f3c3700fdc318bcdd31b192f5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Mar 2022 09:58:02 +0800 Subject: [PATCH 053/340] ACPICA: Utilities: Avoid deleting the same object twice in a row stable inclusion from linux-4.19.226 commit e198165efa9032550f05441c78adee56e16850b4 -------------------------------- [ Upstream commit 1cdfe9e346b4c5509ffe19ccde880fd259d9f7a3 ] ACPICA commit c11af67d8f7e3d381068ce7771322f2b5324d687 If original_count is 0 in acpi_ut_update_ref_count (), acpi_ut_delete_internal_obj () is invoked for the target object, which is incorrect, because that object has been deleted once already and the memory allocated to store it may have been reclaimed and allocated for a different purpose by the host OS. Moreover, a confusing debug message following the "Reference Count is already zero, cannot decrement" warning is printed in that case. To fix this issue, make acpi_ut_update_ref_count () return after finding that original_count is 0 and printing the above warning. Link: https://github.com/acpica/acpica/commit/c11af67d Link: https://github.com/acpica/acpica/pull/652 Reported-by: Mark Asselstine Signed-off-by: Rafael J. Wysocki Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/acpi/acpica/utdelete.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/acpica/utdelete.c b/drivers/acpi/acpica/utdelete.c index 8cc4392c61f3..42e489cfa4d5 100644 --- a/drivers/acpi/acpica/utdelete.c +++ b/drivers/acpi/acpica/utdelete.c @@ -410,6 +410,7 @@ acpi_ut_update_ref_count(union acpi_operand_object *object, u32 action) ACPI_WARNING((AE_INFO, "Obj %p, Reference Count is already zero, cannot decrement\n", object)); + return; } ACPI_DEBUG_PRINT_RAW((ACPI_DB_ALLOCATIONS, -- Gitee From 3998ce7e7ebcac786a24a9358f9edd48b4084e1a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Mar 2022 09:58:03 +0800 Subject: [PATCH 054/340] ACPICA: Executer: Fix the REFCLASS_REFOF case in acpi_ex_opcode_1A_0T_1R() stable inclusion from linux-4.19.226 commit addda1b65b5dfbbdff0f1e100aed0a72b8af5bb8 -------------------------------- [ Upstream commit 24ea5f90ec9548044a6209685c5010edd66ffe8f ] ACPICA commit d984f12041392fa4156b52e2f7e5c5e7bc38ad9e If Operand[0] is a reference of the ACPI_REFCLASS_REFOF class, acpi_ex_opcode_1A_0T_1R () calls acpi_ns_get_attached_object () to obtain return_desc which may require additional resolution with the help of acpi_ex_read_data_from_field (). 
If the latter fails, the reference counter of the original return_desc is decremented which is incorrect, because acpi_ns_get_attached_object () does not increment the reference counter of the object returned by it. This issue may lead to premature deletion of the attached object while it is still attached and a use-after-free and crash in the host OS. For example, this may happen when on evaluation of ref_of() a local region field where there is no registered handler for the given Operation Region. Fix it by making acpi_ex_opcode_1A_0T_1R () return Status right away after a acpi_ex_read_data_from_field () failure. Link: https://github.com/acpica/acpica/commit/d984f120 Link: https://github.com/acpica/acpica/pull/685 Reported-by: Lenny Szubowicz Signed-off-by: Rafael J. Wysocki Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/acpi/acpica/exoparg1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index ba9fbae0cf91..319f4bc6a839 100644 --- a/drivers/acpi/acpica/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -1007,7 +1007,8 @@ acpi_status acpi_ex_opcode_1A_0T_1R(struct acpi_walk_state *walk_state) (walk_state, return_desc, &temp_desc); if (ACPI_FAILURE(status)) { - goto cleanup; + return_ACPI_STATUS + (status); } return_desc = temp_desc; -- Gitee From bd623005181cfaec2ab7e2212835aca9e4926af9 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 14 Mar 2022 09:58:04 +0800 Subject: [PATCH 055/340] dm btree: add a defensive bounds check to insert_at() stable inclusion from linux-4.19.226 commit 716e490c87e4421d5908f7818fc6b61536a17744 -------------------------------- [ Upstream commit 85bca3c05b6cca31625437eedf2060e846c4bbad ] Corrupt metadata could trigger an out of bounds write. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/md/persistent-data/dm-btree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 8aae0624a297..6383afb88f31 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -83,14 +83,16 @@ void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, } static int insert_at(size_t value_size, struct btree_node *node, unsigned index, - uint64_t key, void *value) - __dm_written_to_disk(value) + uint64_t key, void *value) + __dm_written_to_disk(value) { uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); + uint32_t max_entries = le32_to_cpu(node->header.max_entries); __le64 key_le = cpu_to_le64(key); if (index > nr_entries || - index >= le32_to_cpu(node->header.max_entries)) { + index >= max_entries || + nr_entries >= max_entries) { DMERR("too many entries in btree node for insert"); __dm_unbless_for_disk(value); return -ENOMEM; -- Gitee From 37050d1b4cdd1fa154f9badd3de40964eb4075c5 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 14 Mar 2022 09:58:05 +0800 Subject: [PATCH 056/340] dm space map common: add bounds check to sm_ll_lookup_bitmap() stable inclusion from linux-4.19.226 commit 7d585b602a5c054e810b8f47183069872ac2fdf6 -------------------------------- [ Upstream commit cba23ac158db7f3cd48a923d6861bee2eb7a2978 ] Corrupted metadata could warrant returning error from sm_ll_lookup_bitmap(). 
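The guard itself (as added in the hunk below) is a plain range check on the block number before it is used to index the bitmap:

    if (b >= ll->nr_blocks) {
        DMERR_LIMIT("metadata block out of bounds");
        return -EINVAL;
    }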
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/md/persistent-data/dm-space-map-common.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index a284762e548e..5115a2719603 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -279,6 +279,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) struct disk_index_entry ie_disk; struct dm_block *blk; + if (b >= ll->nr_blocks) { + DMERR_LIMIT("metadata block out of bounds"); + return -EINVAL; + } + b = do_div(index, ll->entries_per_block); r = ll->load_ie(ll, index, &ie_disk); if (r < 0) -- Gitee From 51894623c7fc5480d207f302757fc7e12e9a4700 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Mar 2022 09:58:06 +0800 Subject: [PATCH 057/340] scsi: sr: Don't use GFP_DMA stable inclusion from linux-4.19.226 commit b24ef0a4974a95c9ee1aade3f39b90fc30ac552d -------------------------------- [ Upstream commit d94d94969a4ba07a43d62429c60372320519c391 ] The allocated buffers are used as a command payload, for which the block layer and/or DMA API do the proper bounce buffering if needed. Link: https://lore.kernel.org/r/20211222090842.920724-1-hch@lst.de Reported-by: Baoquan He Reviewed-by: Baoquan He Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/scsi/sr.c | 2 +- drivers/scsi/sr_vendor.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index be2daf5536ff..180087d1c6cd 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -885,7 +885,7 @@ static void get_capabilities(struct scsi_cd *cd) /* allocate transfer buffer */ - buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); + buffer = kmalloc(512, GFP_KERNEL); if (!buffer) { sr_printk(KERN_ERR, cd, "out of memory.\n"); return; diff --git a/drivers/scsi/sr_vendor.c b/drivers/scsi/sr_vendor.c index e3b0ce25162b..2887be4316be 100644 --- a/drivers/scsi/sr_vendor.c +++ b/drivers/scsi/sr_vendor.c @@ -119,7 +119,7 @@ int sr_set_blocklength(Scsi_CD *cd, int blocklength) density = (blocklength > 2048) ? 0x81 : 0x83; #endif - buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); + buffer = kmalloc(512, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -167,7 +167,7 @@ int sr_cd_check(struct cdrom_device_info *cdi) if (cd->cdi.mask & CDC_MULTI_SESSION) return 0; - buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); + buffer = kmalloc(512, GFP_KERNEL); if (!buffer) return -ENOMEM; -- Gitee From 8247bb0155d50b8ee404e9fa58f13523cac3b03d Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 14 Mar 2022 09:58:07 +0800 Subject: [PATCH 058/340] serial: Fix incorrect rs485 polarity on uart open stable inclusion from linux-4.19.226 commit 510aa43c9928ca38f1210b6c255373727d7ea312 -------------------------------- commit d3b3404df318504ec084213ab1065b73f49b0f1d upstream. Commit a6845e1e1b78 ("serial: core: Consider rs485 settings to drive RTS") sought to deassert RTS when opening an rs485-enabled uart port. That way, the transceiver does not occupy the bus until it transmits data. Unfortunately, the commit mixed up the logic and *asserted* RTS instead of *deasserting* it: The commit amended uart_port_dtr_rts(), which raises DTR and RTS when opening an rs232 port. 
"Raising" actually means lowering the signal that's coming out of the uart, because an rs232 transceiver not only changes a signal's voltage level, it also *inverts* the signal. See the simplified schematic in the MAX232 datasheet for an example: https://www.ti.com/lit/ds/symlink/max232.pdf So, to raise RTS on an rs232 port, TIOCM_RTS is *set* in port->mctrl and that results in the signal being driven low. In contrast to rs232, the signal level for rs485 Transmit Enable is the identity, not the inversion: If the transceiver expects a "high" RTS signal for Transmit Enable, the signal coming out of the uart must also be high, so TIOCM_RTS must be *cleared* in port->mctrl. The commit did the exact opposite, but it's easy to see why given the confusing semantics of rs232 and rs485. Fix it. Fixes: a6845e1e1b78 ("serial: core: Consider rs485 settings to drive RTS") Cc: stable@vger.kernel.org # v4.14+ Cc: Rafael Gago Castano Cc: Jan Kiszka Cc: Su Bao Cheng Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/9395767847833f2f3193c49cde38501eeb3b5669.1639821059.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/serial/serial_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 4e60045487a8..8d8d63c3ca7d 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -159,7 +159,7 @@ static void uart_port_dtr_rts(struct uart_port *uport, int raise) int RTS_after_send = !!(uport->rs485.flags & SER_RS485_RTS_AFTER_SEND); if (raise) { - if (rs485_on && !RTS_after_send) { + if (rs485_on && RTS_after_send) { uart_set_mctrl(uport, TIOCM_DTR); uart_clear_mctrl(uport, TIOCM_RTS); } else { @@ -168,7 +168,7 @@ static void uart_port_dtr_rts(struct uart_port *uport, int raise) } else { unsigned int clear = TIOCM_DTR; - clear |= (!rs485_on || !RTS_after_send) ? TIOCM_RTS : 0; + clear |= (!rs485_on || RTS_after_send) ? TIOCM_RTS : 0; uart_clear_mctrl(uport, clear); } } -- Gitee From 5b303f3d612c0f2d0a462aa3236b33f7e41dbb7a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 14 Mar 2022 09:58:08 +0800 Subject: [PATCH 059/340] cputime, cpuacct: Include guest time in user time in cpuacct.stat stable inclusion from linux-4.19.226 commit 952514c8565cf72a966993b473fae1708c3684f3 -------------------------------- commit 9731698ecb9c851f353ce2496292ff9fcea39dff upstream. cpuacct.stat in no-root cgroups shows user time without guest time included int it. This doesn't match with user time shown in root cpuacct.stat and /proc//stat. This also affects cgroup2's cpu.stat in the same way. Make account_guest_time() to add user time to cgroup's cpustat to fix this. Fixes: ef12fefabf94 ("cpuacct: add per-cgroup utime/stime statistics") Signed-off-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Cc: Link: https://lore.kernel.org/r/20211115164607.23784-1-arbn@yandex-team.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/sched/cputime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8dd27c1fbb29..0df2448eb5d1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -152,10 +152,10 @@ void account_guest_time(struct task_struct *p, u64 cputime) /* Add guest time to cpustat. 
*/ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime; + task_group_account_field(p, CPUTIME_NICE, cputime); cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime; + task_group_account_field(p, CPUTIME_USER, cputime); cpustat[CPUTIME_GUEST] += cputime; } } -- Gitee From 825982c0c0d10628a2a635369c2608ff598f6218 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Mar 2022 09:58:09 +0800 Subject: [PATCH 060/340] ext4: make sure to reset inode lockdep class when quota enabling fails stable inclusion from linux-4.19.226 commit ef41f72716c469a670b9d556b65e5ed83a3a5fd7 -------------------------------- commit 4013d47a5307fdb5c13370b5392498b00fedd274 upstream. When we succeed in enabling some quota type but fail to enable another one with quota feature, we correctly disable all enabled quota types. However we forget to reset i_data_sem lockdep class. When the inode gets freed and reused, it will inherit this lockdep class (i_data_sem is initialized only when a slab is created) and thus eventually lockdep barfs about possible deadlocks. Reported-and-tested-by: syzbot+3b6f9218b1301ddda3e2@syzkaller.appspotmail.com Signed-off-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20211007155336.12493-3-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/ext4/super.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f9d60ec607e..3100f4aa2d59 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6166,8 +6166,19 @@ static int ext4_enable_quotas(struct super_block *sb) "Failed to enable quota tracking " "(type=%d, err=%d). Please run " "e2fsck to fix.", type, err); - for (type--; type >= 0; type--) + for (type--; type >= 0; type--) { + struct inode *inode; + + inode = sb_dqopt(sb)->files[type]; + if (inode) + inode = igrab(inode); dquot_quota_off(sb, type); + if (inode) { + lockdep_set_quota_inode(inode, + I_DATA_SEM_NORMAL); + iput(inode); + } + } return err; } -- Gitee From 546ab4a5d01c18746055fc42d2ee82c9f56cd45b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Mar 2022 09:58:10 +0800 Subject: [PATCH 061/340] ext4: make sure quota gets properly shutdown on error stable inclusion from linux-4.19.226 commit 841bba6544e10cab41535b76bbdd37555a6ab9df -------------------------------- commit 15fc69bbbbbc8c72e5f6cc4e1be0f51283c5448e upstream. When we hit an error when enabling quotas and setting inode flags, we do not properly shutdown quota subsystem despite returning error from Q_QUOTAON quotactl. This can lead to some odd situations like kernel using quota file while it is still writeable for userspace. Make sure we properly cleanup the quota subsystem in case of error. 
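Schematically, the error path in ext4_quota_on() now looks like this (a condensed sketch of the hunk below, not the full function):

    err = dquot_quota_on(sb, type, format_id, path);
    if (!err) {
        /* ... mark the quota inode NOATIME/IMMUTABLE under a handle ... */
        if (err)
            dquot_quota_off(sb, type);
    }
    if (err)
        lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL);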
Signed-off-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20211007155336.12493-2-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/ext4/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3100f4aa2d59..f961f7d94a70 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6080,10 +6080,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) { - lockdep_set_quota_inode(path->dentry->d_inode, - I_DATA_SEM_NORMAL); - } else { + if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; @@ -6103,7 +6100,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); + if (err) + dquot_quota_off(sb, type); } + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); return err; } -- Gitee From 7e65b84907dfa5c5b994213abf93ae1fe2b201fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= Date: Mon, 14 Mar 2022 09:58:11 +0800 Subject: [PATCH 062/340] ext4: set csum seed in tmp inode while migrating to extents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from linux-4.19.226 commit 9103cafdc4531285b6ededd0a3437effd71ff255 -------------------------------- commit e81c9302a6c3c008f5c30beb73b38adb0170ff2d upstream. When migrating to extents, the temporary inode will have it's own checksum seed. This means that, when swapping the inodes data, the inode checksums will be incorrect. This can be fixed by recalculating the extents checksums again. Or simply by copying the seed into the temporary inode. Link: https://bugzilla.kernel.org/show_bug.cgi?id=213357 Reported-by: Jeroen van Wolffelaar Signed-off-by: Luís Henriques Link: https://lore.kernel.org/r/20211214175058.19511-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/ext4/migrate.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7eb036cd1d93..0b3194b59d12 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -457,6 +457,17 @@ int ext4_ext_migrate(struct inode *inode) ext4_journal_stop(handle); goto out_unlock; } + /* + * Use the correct seed for checksum (i.e. the seed from 'inode'). This + * is so that the metadata blocks will have the correct checksum after + * the migration. + * + * Note however that, if a crash occurs during the migration process, + * the recovery process is broken because the tmp_inode checksums will + * be wrong and the orphans cleanup will fail. 
+ */ + ei = EXT4_I(inode); + EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later @@ -500,7 +511,6 @@ int ext4_ext_migrate(struct inode *inode) goto out_tmp_inode; } - ei = EXT4_I(inode); i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); -- Gitee From 277de38db93dde878c60df1c7071f76f8917f663 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 14 Mar 2022 09:58:12 +0800 Subject: [PATCH 063/340] ext4: don't use the orphan list when migrating an inode stable inclusion from linux-4.19.226 commit 33446496d21753b5ceb55be4d6e593b487a61239 -------------------------------- commit 6eeaf88fd586f05aaf1d48cb3a139d2a5c6eb055 upstream. We probably want to remove the indirect block to extents migration feature after a deprecation window, but until then, let's fix a potential data loss problem caused by the fact that we put the tmp_inode on the orphan list. In the unlikely case where we crash and do a journal recovery, the data blocks belonging to the inode being migrated are also represented in the tmp_inode on the orphan list --- and so its data blocks will get marked unallocated, and available for reuse. Instead, stop putting the tmp_inode on the oprhan list. So in the case where we crash while migrating the inode, we'll leak an inode, which is not a disaster. It will be easily fixed the next time we run fsck, and it's better than potentially having blocks getting claimed by two different files, and losing data as a result. Signed-off-by: Theodore Ts'o Reviewed-by: Lukas Czerner Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/ext4/migrate.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 0b3194b59d12..75a769634b2b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -435,12 +435,12 @@ int ext4_ext_migrate(struct inode *inode) percpu_down_write(&sbi->s_writepages_rwsem); /* - * Worst case we can touch the allocation bitmaps, a bgd - * block, and a block to link in the orphan list. We do need - * need to worry about credits for modifying the quota inode. + * Worst case we can touch the allocation bitmaps and a block + * group descriptor block. We do need need to worry about + * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, - 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); @@ -461,10 +461,6 @@ int ext4_ext_migrate(struct inode *inode) * Use the correct seed for checksum (i.e. the seed from 'inode'). This * is so that the metadata blocks will have the correct checksum after * the migration. - * - * Note however that, if a crash occurs during the migration process, - * the recovery process is broken because the tmp_inode checksums will - * be wrong and the orphans cleanup will fail. 
*/ ei = EXT4_I(inode); EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; @@ -476,7 +472,6 @@ int ext4_ext_migrate(struct inode *inode) clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); - ext4_orphan_add(handle, tmp_inode); ext4_journal_stop(handle); /* @@ -501,12 +496,6 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { - /* - * It is impossible to update on-disk structures without - * a handle, so just rollback in-core changes and live other - * work to orphan_list_cleanup() - */ - ext4_orphan_del(NULL, tmp_inode); retval = PTR_ERR(handle); goto out_tmp_inode; } -- Gitee From d116bcc0a0ba75c2356381010feda7d54e506a34 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 14 Mar 2022 09:58:13 +0800 Subject: [PATCH 064/340] crypto: stm32/crc32 - Fix kernel BUG triggered in probe() stable inclusion from linux-4.19.226 commit e670c4b7c1ca134463211b27af69695a3adcb846 -------------------------------- commit 29009604ad4e3ef784fd9b9fef6f23610ddf633d upstream. The include/linux/crypto.h struct crypto_alg field cra_driver_name description states "Unique name of the transformation provider. " ... " this contains the name of the chip or provider and the name of the transformation algorithm." In case of the stm32-crc driver, field cra_driver_name is identical for all registered transformation providers and set to the name of the driver itself, which is incorrect. This patch fixes it by assigning a unique cra_driver_name to each registered transformation provider. The kernel crash is triggered when the driver calls crypto_register_shashes() which calls crypto_register_shash(), which calls crypto_register_alg(), which calls __crypto_register_alg(), which returns -EEXIST, which is propagated back through this call chain. Upon -EEXIST from crypto_register_shash(), the crypto_register_shashes() starts unregistering the providers back, and calls crypto_unregister_shash(), which calls crypto_unregister_alg(), and this is where the BUG() triggers due to incorrect cra_refcnt. 
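Concretely (see the hunk below), the two registered algorithms stop sharing DRIVER_NAME and each get a unique driver name, so registration no longer hits -EEXIST:

    .cra_name        = "crc32",
    .cra_driver_name = "stm32-crc32-crc32",
    ...
    .cra_name        = "crc32c",
    .cra_driver_name = "stm32-crc32-crc32c",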
Fixes: b51dbe90912a ("crypto: stm32 - Support for STM32 CRC32 crypto module") Signed-off-by: Marek Vasut Cc: # 4.12+ Cc: Alexandre Torgue Cc: Fabien Dessenne Cc: Herbert Xu Cc: Lionel Debieve Cc: Nicolas Toromanoff Cc: linux-arm-kernel@lists.infradead.org Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-crypto@vger.kernel.org Acked-by: Nicolas Toromanoff Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/crypto/stm32/stm32_crc32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/stm32/stm32_crc32.c b/drivers/crypto/stm32/stm32_crc32.c index 29d2095d9dfd..48c4a71d1cb3 100644 --- a/drivers/crypto/stm32/stm32_crc32.c +++ b/drivers/crypto/stm32/stm32_crc32.c @@ -217,7 +217,7 @@ static struct shash_alg algs[] = { .digestsize = CHKSUM_DIGEST_SIZE, .base = { .cra_name = "crc32", - .cra_driver_name = DRIVER_NAME, + .cra_driver_name = "stm32-crc32-crc32", .cra_priority = 200, .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, @@ -239,7 +239,7 @@ static struct shash_alg algs[] = { .digestsize = CHKSUM_DIGEST_SIZE, .base = { .cra_name = "crc32c", - .cra_driver_name = DRIVER_NAME, + .cra_driver_name = "stm32-crc32-crc32c", .cra_priority = 200, .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, -- Gitee From c26e893496ad4c8e7f04f37c2b672434ceef7368 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Mar 2022 09:58:14 +0800 Subject: [PATCH 065/340] af_unix: annote lockless accesses to unix_tot_inflight & gc_in_progress stable inclusion from linux-4.19.226 commit cf3c4b5912cb208194507a21f6209e8ae4e6c260 -------------------------------- commit 9d6d7f1cb67cdee15f1a0e85aacfb924e0e02435 upstream. wait_for_unix_gc() reads unix_tot_inflight & gc_in_progress without synchronization. Adds READ_ONCE()/WRITE_ONCE() and their associated comments to better document the intent. 
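The annotation pattern is minimal (see the hunks below): the lockless test in wait_for_unix_gc() becomes a READ_ONCE(), paired with WRITE_ONCE() at every update site:

    if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
        !READ_ONCE(gc_in_progress))
        unix_gc();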
BUG: KCSAN: data-race in unix_inflight / wait_for_unix_gc write to 0xffffffff86e2b7c0 of 4 bytes by task 9380 on cpu 0: unix_inflight+0x1e8/0x260 net/unix/scm.c:63 unix_attach_fds+0x10c/0x1e0 net/unix/scm.c:121 unix_scm_to_skb net/unix/af_unix.c:1674 [inline] unix_dgram_sendmsg+0x679/0x16b0 net/unix/af_unix.c:1817 unix_seqpacket_sendmsg+0xcc/0x110 net/unix/af_unix.c:2258 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2409 ___sys_sendmsg net/socket.c:2463 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2549 __do_sys_sendmmsg net/socket.c:2578 [inline] __se_sys_sendmmsg net/socket.c:2575 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2575 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffffffff86e2b7c0 of 4 bytes by task 9375 on cpu 1: wait_for_unix_gc+0x24/0x160 net/unix/garbage.c:196 unix_dgram_sendmsg+0x8e/0x16b0 net/unix/af_unix.c:1772 unix_seqpacket_sendmsg+0xcc/0x110 net/unix/af_unix.c:2258 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2409 ___sys_sendmsg net/socket.c:2463 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2549 __do_sys_sendmmsg net/socket.c:2578 [inline] __se_sys_sendmmsg net/socket.c:2575 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2575 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000002 -> 0x00000004 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 9375 Comm: syz-executor.1 Not tainted 5.16.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 9915672d4127 ("af_unix: limit unix_tot_inflight") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220114164328.2038499-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/unix/garbage.c | 14 +++++++++++--- net/unix/scm.c | 6 ++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8bbe1b8e4ff7..4d283e26d816 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -197,8 +197,11 @@ void wait_for_unix_gc(void) { /* If number of inflight sockets is insane, * force a garbage collect right now. + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight() and gc_in_progress(). */ - if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } @@ -218,7 +221,9 @@ void unix_gc(void) if (gc_in_progress) goto out; - gc_in_progress = true; + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, true); + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -304,7 +309,10 @@ void unix_gc(void) /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); - gc_in_progress = false; + + /* Paired with READ_ONCE() in wait_for_unix_gc(). 
*/ + WRITE_ONCE(gc_in_progress, false); + wake_up(&unix_gc_wait); out: diff --git a/net/unix/scm.c b/net/unix/scm.c index 8c40f2b32392..ce700b22ecce 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -59,7 +59,8 @@ void unix_inflight(struct user_struct *user, struct file *fp) } else { BUG_ON(list_empty(&u->link)); } - unix_tot_inflight++; + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } user->unix_inflight++; spin_unlock(&unix_gc_lock); @@ -79,7 +80,8 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); - unix_tot_inflight--; + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); } user->unix_inflight--; spin_unlock(&unix_gc_lock); -- Gitee From dcc767345649d22d7433dc1c1704e26d86caa885 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Mar 2022 09:58:15 +0800 Subject: [PATCH 066/340] netns: add schedule point in ops_exit_list() stable inclusion from linux-4.19.226 commit 4048cedfd16a995b2ef4294b9539da17e2fea750 -------------------------------- commit 2836615aa22de55b8fca5e32fe1b27a67cda625e upstream. When under stress, cleanup_net() can have to dismantle netns in big numbers. ops_exit_list() currently calls many helpers [1] that have no schedule point, and we can end up with soft lockups, particularly on hosts with many cpus. Even for moderate amount of netns processed by cleanup_net() this patch avoids latency spikes. [1] Some of these helpers like fib_sync_up() and fib_sync_down_dev() are very slow because net/ipv4/fib_semantics.c uses host-wide hash tables, and ifindex is used as the only input of two hash functions. ifindexes tend to be the same for all netns (lo.ifindex==1 per instance) This will be fixed in a separate patch. Fixes: 72ad937abd0a ("net: Add support for batching network namespace cleanups") Signed-off-by: Eric Dumazet Cc: Eric W. Biederman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/core/net_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c60123dff803..f4aacbb76fa5 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -149,8 +149,10 @@ static void ops_exit_list(const struct pernet_operations *ops, { struct net *net; if (ops->exit) { - list_for_each_entry(net, net_exit_list, exit_list) + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); + cond_resched(); + } } if (ops->exit_batch) ops->exit_batch(net_exit_list); -- Gitee From 969a3664489d338d03b49cc383dc3df7981aa0e9 Mon Sep 17 00:00:00 2001 From: "Doyle, Patrick" Date: Mon, 14 Mar 2022 09:58:16 +0800 Subject: [PATCH 067/340] mtd: nand: bbt: Fix corner case in bad block table handling stable inclusion from linux-4.19.226 commit 1550a97e4a5d8bf29071bd6c17355ca173e90f73 -------------------------------- commit fd0d8d85f7230052e638a56d1bfea170c488e6bc upstream. In the unlikely event that both blocks 10 and 11 are marked as bad (on a 32 bit machine), then the process of marking block 10 as bad stomps on cached entry for block 11. There are (of course) other examples. 
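To make the 10/11 example concrete (assuming the driver's three status bits per block, which is what makes entry 10 straddle a word boundary on a 32-bit machine): entry 10 starts at bit offset 30, so rbits = 3 + 30 - 32 = 1 bit spills into the next long. The old 'val >> rbits' shifted the 3-bit value by only 1 and ORed two bits into that long, clobbering bit 1, which belongs to entry 11; 'val >> (bits_per_block - rbits)' shifts by 2 and deposits only the single spilled bit.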
Signed-off-by: Patrick Doyle Reviewed-by: Richard Weinberger Signed-off-by: Yoshio Furuyama [: Fixed the title] Signed-off-by: Miquel Raynal Cc: Frieder Schrempf Link: https://lore.kernel.org/linux-mtd/774a92693f311e7de01e5935e720a179fb1b2468.1616635406.git.ytc-mb-yfuruyama7@kioxia.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/mtd/nand/bbt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/bbt.c b/drivers/mtd/nand/bbt.c index 044adf913854..64af6898131d 100644 --- a/drivers/mtd/nand/bbt.c +++ b/drivers/mtd/nand/bbt.c @@ -123,7 +123,7 @@ int nanddev_bbt_set_block_status(struct nand_device *nand, unsigned int entry, unsigned int rbits = bits_per_block + offs - BITS_PER_LONG; pos[1] &= ~GENMASK(rbits - 1, 0); - pos[1] |= val >> rbits; + pos[1] |= val >> (bits_per_block - rbits); } return 0; -- Gitee From 3dfc9fb2089afddb537aab262cf3bb69e7fb8a33 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Mar 2022 09:58:17 +0800 Subject: [PATCH 068/340] select: Fix indefinitely sleeping task in poll_schedule_timeout() stable inclusion from linux-4.19.227 commit 6717900f775a6129a7b4d03ba4922218d8bf1caa -------------------------------- commit 68514dacf2715d11b91ca50d88de047c086fea9c upstream. A task can end up indefinitely sleeping in do_select() -> poll_schedule_timeout() when the following race happens:

  TASK1 (thread1)                TASK2                      TASK1 (thread2)
  do_select()
    setup poll_wqueues table
    with 'fd'
                                 write data to 'fd'
                                 pollwake()
                                   table->triggered = 1
                                                            closes 'fd' thread1 is
                                                            waiting for
    poll_schedule_timeout()
      - sees table->triggered
      table->triggered = 0
      return -EINTR
      loop back in do_select()

But at this point when TASK1 loops back, the fdget() in the setup of poll_wqueues fails. So now we never find 'fd' is ready for reading and sleep in poll_schedule_timeout() indefinitely. Treat an fd that got closed as an fd on which some event happened. This makes sure we cannot block indefinitely in do_select(). Another option would be to return -EBADF in this case but that has a potential of subtly breaking applications that exercise this behavior and it happens to work for them. So returning fd as active seems like a safer choice.
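In code terms (condensed from the hunks below), EPOLLNVAL is added to the poll event sets and becomes the default mask when fdget() fails, so a closed fd is reported as active instead of being waited on forever:

    #define POLLIN_SET  (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR | EPOLLNVAL)
    #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR | EPOLLNVAL)
    #define POLLEX_SET  (EPOLLPRI | EPOLLNVAL)
    ...
    mask = EPOLLNVAL;       /* used when fdget(i) fails, i.e. the fd was closed */
    f = fdget(i);
    if (f.file) {
        wait_key_set(wait, in, out, bit, busy_flag);
        mask = vfs_poll(f.file, wait);
        fdput(f);
    }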
Suggested-by: Linus Torvalds CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/select.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/select.c b/fs/select.c index b684f0dd6db8..67e4be14c00c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -458,9 +458,11 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) return max; } -#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR) -#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR) -#define POLLEX_SET (EPOLLPRI) +#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ + EPOLLNVAL) +#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ + EPOLLNVAL) +#define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, @@ -527,6 +529,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) break; if (!(bit & all_bits)) continue; + mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, @@ -534,34 +537,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) mask = vfs_poll(f.file, wait); fdput(f); - if ((mask & POLLIN_SET) && (in & bit)) { - res_in |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLOUT_SET) && (out & bit)) { - res_out |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLEX_SET) && (ex & bit)) { - res_ex |= bit; - retval++; - wait->_qproc = NULL; - } - /* got something, stop busy polling */ - if (retval) { - can_busy_loop = false; - busy_flag = 0; - - /* - * only remember a returned - * POLL_BUSY_LOOP if we asked for it - */ - } else if (busy_flag & mask) - can_busy_loop = true; - } + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait->_qproc = NULL; + } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } if (res_in) *rinp = res_in; -- Gitee From faba439fce06cc5bf270889214dd947a9fd3f65b Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Mon, 14 Mar 2022 10:50:19 +0800 Subject: [PATCH 069/340] nfc: st21nfca: Fix potential buffer overflows in EVT_TRANSACTION mainline inclusion from mainline-v5.17-rc1 commit 4fbcc1a4cb20fe26ad0225679c536c80f1648221 category: bugfix bugzilla: 186393 CVE: CVE-2022-26490 ------------------------------------------------- It appears that there are some buffer overflows in EVT_TRANSACTION. This happens because the length parameters that are passed to memcpy come directly from skb->data and are not guarded in any way. Signed-off-by: Jordy Zomer Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. 
Miller Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Laibin Qiu --- drivers/nfc/st21nfca/se.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index fd967a38a94a..ced3c20d6453 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -332,6 +332,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, return -ENOMEM; transaction->aid_len = skb->data[1]; + + /* Checking if the length of the AID is valid */ + if (transaction->aid_len > sizeof(transaction->aid)) + return -EINVAL; + memcpy(transaction->aid, &skb->data[2], transaction->aid_len); @@ -341,6 +346,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, return -EPROTO; transaction->params_len = skb->data[transaction->aid_len + 3]; + + /* Total size is allocated (skb->len - 2) minus fixed array members */ + if (transaction->params_len > ((skb->len - 2) - sizeof(struct nfc_evt_transaction))) + return -EINVAL; + memcpy(transaction->params, skb->data + transaction->aid_len + 4, transaction->params_len); -- Gitee From 612e8299ecc7897799df1be7703f7f7f6d6f2a9c Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Mon, 14 Mar 2022 14:57:32 +0800 Subject: [PATCH 070/340] mm: fix unable to use reliable memory in page cache hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA ---------------------------------------------- It is inaccurate when accumulate percpu variable without lock, it will result in pagecache_reliable_pages to be negative in sometime, and will prevent pagecache using reliable memory. For more accurate statistic, replace percpu variable by percpu_counter. The additional percpu_counter will be access in alloc_pages, the init of these two percpu_conter is too late in late_initcall, allocations that use alloc_pages should have to check if the two counter has been inited, that will introduce latency, in order to sovle this, init the two percpu_counter in advance. 
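
A user-space analogue of the batched-counter pattern that percpu_counter provides (editorial sketch; the names and batch size below are made up, not the kernel API): each writer accumulates a small private delta and folds it into the shared total only once the delta exceeds a batch, so reading the total stays cheap and its error stays bounded, unlike summing raw per-CPU variables without a lock.

#include <stdatomic.h>
#include <stdio.h>

#define BATCH 32

static atomic_long total;			/* shared, approximate value */
static _Thread_local long local_delta;		/* private to each thread    */

static void counter_add(long amount)
{
	local_delta += amount;
	if (local_delta >= BATCH || local_delta <= -BATCH) {
		atomic_fetch_add(&total, local_delta);
		local_delta = 0;
	}
}

static long counter_read_positive(void)
{
	long v = atomic_load(&total);

	return v < 0 ? 0 : v;	/* same idea as percpu_counter_read_positive() */
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		counter_add(1);
	printf("approximate total: %ld (exact would be 100)\n",
	       counter_read_positive());
	return 0;
}
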
Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang Signed-off-by: Laibin Qiu --- include/linux/mem_reliable.h | 17 +++++----- mm/mem_reliable.c | 62 ++++++++++++++++++------------------ mm/page_alloc.c | 5 +++ 3 files changed, 45 insertions(+), 39 deletions(-) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index ba5d41edbc44..c4f954340ea7 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -21,8 +21,9 @@ extern bool shmem_reliable; extern struct percpu_counter reliable_shmem_used_nr_page; extern bool pagecache_use_reliable_mem; DECLARE_PER_CPU(long, nr_reliable_buddy_pages); -DECLARE_PER_CPU(long, pagecache_reliable_pages); -DECLARE_PER_CPU(long, anon_reliable_pages); + +extern struct percpu_counter pagecache_reliable_pages; +extern struct percpu_counter anon_reliable_pages; extern unsigned long nr_reliable_reserve_pages __read_mostly; extern long shmem_reliable_nr_page __read_mostly; @@ -43,6 +44,7 @@ extern void reliable_lru_add(enum lru_list lru, struct page *page, extern void page_cache_prepare_alloc(gfp_t *gfp); extern void reliable_lru_add_batch(int zid, enum lru_list lru, int val); +extern bool mem_reliable_counter_initialized(void); static inline bool mem_reliable_is_enabled(void) { @@ -81,13 +83,10 @@ static inline void reliable_page_counter(struct page *page, static inline bool reliable_mem_limit_check(unsigned long nr_page) { - int cpu; - long num = 0; + s64 num; - for_each_possible_cpu(cpu) { - num += per_cpu(pagecache_reliable_pages, cpu); - num += per_cpu(anon_reliable_pages, cpu); - } + num = percpu_counter_read_positive(&pagecache_reliable_pages); + num += percpu_counter_read_positive(&anon_reliable_pages); return num + nr_page <= task_reliable_limit / PAGE_SIZE; } @@ -184,6 +183,8 @@ static inline void reliable_lru_add(enum lru_list lru, static inline void page_cache_prepare_alloc(gfp_t *gfp) {} static inline void reliable_lru_add_batch(int zid, enum lru_list lru, int val) {} + +static inline bool mem_reliable_counter_initialized(void) { return false; } #endif #endif diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 62df78657ff9..033af716610f 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -34,12 +34,18 @@ unsigned long nr_reliable_reserve_pages = MEM_RELIABLE_RESERVE_MIN / PAGE_SIZE; long shmem_reliable_nr_page = LONG_MAX; bool pagecache_use_reliable_mem __read_mostly = true; -DEFINE_PER_CPU(long, pagecache_reliable_pages); -DEFINE_PER_CPU(long, anon_reliable_pages); +struct percpu_counter pagecache_reliable_pages; +struct percpu_counter anon_reliable_pages; static unsigned long zero; static unsigned long reliable_pagecache_max_bytes = ULONG_MAX; +bool mem_reliable_counter_initialized(void) +{ + return likely(percpu_counter_initialized(&pagecache_reliable_pages)) && + likely((percpu_counter_initialized(&anon_reliable_pages))); +} + bool mem_reliable_status(void) { return mem_reliable_is_enabled(); @@ -66,9 +72,9 @@ void reliable_lru_add_batch(int zid, enum lru_list lru, int val) if (zid < ZONE_MOVABLE && zid >= 0) { if (is_file_lru(lru)) - this_cpu_add(pagecache_reliable_pages, val); + percpu_counter_add(&pagecache_reliable_pages, val); else if (is_anon_lru(lru)) - this_cpu_add(anon_reliable_pages, val); + percpu_counter_add(&anon_reliable_pages, val); } } @@ -78,14 +84,14 @@ void reliable_lru_add(enum lru_list lru, struct page *page, int val) return; if (is_file_lru(lru)) - this_cpu_add(pagecache_reliable_pages, val); + percpu_counter_add(&pagecache_reliable_pages, val); else if (is_anon_lru(lru)) - 
this_cpu_add(anon_reliable_pages, val); + percpu_counter_add(&anon_reliable_pages, val); else if (lru == LRU_UNEVICTABLE) { if (PageAnon(page)) - this_cpu_add(anon_reliable_pages, val); + percpu_counter_add(&anon_reliable_pages, val); else - this_cpu_add(pagecache_reliable_pages, val); + percpu_counter_add(&pagecache_reliable_pages, val); } } @@ -188,21 +194,20 @@ static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) void reliable_report_meminfo(struct seq_file *m) { bool pagecache_enabled = pagecache_reliable_is_enabled(); - long nr_pagecache_pages = 0; - long nr_anon_pages = 0; + s64 nr_pagecache_pages = 0; + s64 nr_anon_pages = 0; long nr_buddy_pages = 0; int cpu; if (!mem_reliable_is_enabled()) return; - for_each_possible_cpu(cpu) { + for_each_possible_cpu(cpu) nr_buddy_pages += per_cpu(nr_reliable_buddy_pages, cpu); - nr_anon_pages += per_cpu(anon_reliable_pages, cpu); - if (pagecache_enabled) - nr_pagecache_pages += - per_cpu(pagecache_reliable_pages, cpu); - } + + nr_anon_pages = percpu_counter_sum_positive(&anon_reliable_pages); + if (pagecache_enabled) + nr_pagecache_pages = percpu_counter_sum_positive(&pagecache_reliable_pages); show_val_kb(m, "ReliableTotal: ", total_reliable_mem_sz() >> PAGE_SHIFT); @@ -445,8 +450,7 @@ static struct ctl_table reliable_dir_table[] = { void page_cache_prepare_alloc(gfp_t *gfp) { - long nr_reliable = 0; - int cpu; + s64 nr_reliable = 0; if (!mem_reliable_is_enabled()) return; @@ -454,11 +458,7 @@ void page_cache_prepare_alloc(gfp_t *gfp) if (!pagecache_reliable_is_enabled()) goto no_reliable; - for_each_possible_cpu(cpu) - nr_reliable += this_cpu_read(pagecache_reliable_pages); - - if (nr_reliable < 0) - goto no_reliable; + nr_reliable = percpu_counter_read_positive(&pagecache_reliable_pages); if (nr_reliable > reliable_pagecache_max_bytes >> PAGE_SHIFT) goto no_reliable; @@ -480,9 +480,12 @@ static int __init reliable_sysctl_init(void) return -1; } + percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL); + percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL); + return 0; } -late_initcall(reliable_sysctl_init); +arch_initcall(reliable_sysctl_init); #else static void mem_reliable_ctrl_bit_disabled(int idx) {} #endif @@ -515,21 +518,18 @@ static void mem_reliable_feature_disable(int idx) void reliable_show_mem_info(void) { - int cpu; - long num = 0; + s64 num = 0; if (!mem_reliable_is_enabled()) return; - for_each_possible_cpu(cpu) { - num += per_cpu(anon_reliable_pages, cpu); - num += per_cpu(pagecache_reliable_pages, cpu); - } + num += percpu_counter_sum_positive(&anon_reliable_pages); + num += percpu_counter_sum_positive(&pagecache_reliable_pages); pr_info("ReliableTotal: %lu kB", total_reliable_mem_sz() >> 10); pr_info("ReliableUsed: %lu kB", used_reliable_mem_sz() >> 10); pr_info("task_reliable_limit: %lu kB", task_reliable_limit >> 10); - pr_info("reliable_user_used: %ld kB", num << (PAGE_SHIFT - 10)); + pr_info("reliable_user_used: %lld kB", num << (PAGE_SHIFT - 10)); } void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd4354d9ebad..dab62d1d3a6e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4674,6 +4674,11 @@ static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order) * allocation trigger task_reliable_limit */ if (is_global_init(current)) { + if (!mem_reliable_counter_initialized()) { + *gfp_mask |= ___GFP_RELIABILITY; + return true; + } + if (reliable_mem_limit_check(1 << order) && mem_reliable_watermark_ok(1 
<< order)) *gfp_mask |= ___GFP_RELIABILITY; -- Gitee From 216ce413fb52be3da2dd38e854c185bc61dfcfd6 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Mon, 14 Mar 2022 14:57:33 +0800 Subject: [PATCH 071/340] mm: introduce "clear_freelist" kernel parameter hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4XB9H CVE: NA -------------------------------- CONFIG_CLEAR_FREELIST_PAGE specifies the default value for clear freelist. Add a kernel parameter to make it possible to override the default. Keep clear_freelist disabled unless "clear_freelist" is explicitly specified. Signed-off-by: Yu Liao Reviewed-by: Kefeng Wang Signed-off-by: Laibin Qiu --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ mm/clear_freelist_page.c | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a46b2fe191ba..2cb6c844d751 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -516,6 +516,10 @@ cio_ignore= [S390] See Documentation/s390/CommonIO for details. + + clear_freelist + Enable clear_freelist feature. + clk_ignore_unused [CLK] Prevents the clock framework from automatically gating diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c index 69975f458dc7..f0a13b8fdcdb 100644 --- a/mm/clear_freelist_page.c +++ b/mm/clear_freelist_page.c @@ -154,9 +154,18 @@ static struct ctl_table sys_ctl_table[] = { { } }; +static bool clear_freelist_enabled; +static int __init setup_clear_freelist(char *str) +{ + clear_freelist_enabled = true; + return 1; +} +__setup("clear_freelist", setup_clear_freelist); + static int __init clear_freelist_init(void) { - register_sysctl_table(sys_ctl_table); + if (clear_freelist_enabled) + register_sysctl_table(sys_ctl_table); return 0; } -- Gitee From 6fb0d251f7687d8e9519db87b07275d04e5b35e9 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Mon, 14 Mar 2022 21:56:59 +0800 Subject: [PATCH 072/340] mm: disable memory reliable when kdump is in progress hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Kdump only have limited memory and will lead to bugly memory reliable features if memory reliable if enabled. So disable memory reliable if kdump is in progress. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Laibin Qiu --- mm/mem_reliable.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 033af716610f..5505577d3784 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -9,6 +9,7 @@ #include #include #include +#include #define MEM_RELIABLE_RESERVE_MIN (256UL << 20) @@ -128,6 +129,11 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) if (!reliable_enabled) return; + if (is_kdump_kernel()) { + pr_err("init failed, the kdump is in progress\n"); + return; + } + if (atomic_long_read(&total_reliable_mem) == 0) { memset(zone_movable_pfn, 0, sizeof(unsigned long) * MAX_NUMNODES); -- Gitee From 9b6c51cd780588813c5d327e2a6d677d010a2b6f Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Mon, 14 Mar 2022 21:57:00 +0800 Subject: [PATCH 073/340] mm: fix zoneref mapping problem in memory reliable hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- The mapping between zoneref and zone will be updated if __GFP_THISNODE is defined and memory reliable fallback is enabled. This will put ZONE_MOVABLE in a wrong zonerefs slot and lead to the origin zone unselectable. With this patch, high_zoneidx is updated via gfp_zone() which ___GFP_RELIABILITY is removed from the origin gfp_mask. Perferred zoneref is recalculated after. Fixes: 3023a4b35d41 ("mm: Introduce fallback mechanism for memory reliable") Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Laibin Qiu --- mm/page_alloc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dab62d1d3a6e..f4d4716b9049 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3672,21 +3672,21 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, } #ifdef CONFIG_MEMORY_RELIABLE -static inline struct zone *reliable_fb_find_zone(gfp_t gfp_mask, - struct alloc_context *ac) +static inline void reliable_fb_find_zone(gfp_t gfp_mask, + struct alloc_context *ac) { if (!reliable_allow_fb_enabled()) - return NULL; + return; - /* dst nodemask may don't have zone we want, fallback here */ + /* dst node don't have zone we want, fallback here */ if ((gfp_mask & __GFP_THISNODE) && (ac->high_zoneidx == ZONE_NORMAL) && (gfp_mask & ___GFP_RELIABILITY)) { - struct zoneref *ref = first_zones_zonelist( - ac->zonelist, ZONE_MOVABLE, ac->nodemask); - return ref->zone; + ac->high_zoneidx = gfp_zone(gfp_mask & ~___GFP_RELIABILITY); + ac->preferred_zoneref = first_zones_zonelist( + ac->zonelist, ac->high_zoneidx, ac->nodemask); } - return NULL; + return; } static inline struct page * @@ -3712,10 +3712,10 @@ reliable_fb_before_oom(gfp_t gfp_mask, int order, return NULL; } #else -static inline struct zone *reliable_fb_find_zone(gfp_t gfp_mask, - struct alloc_context *ac) +static inline void reliable_fb_find_zone(gfp_t gfp_mask, + struct alloc_context *ac) { - return NULL; + return; } static inline struct page *reliable_fb_before_oom(gfp_t gfp_mask, int order, @@ -4375,8 +4375,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) { - ac->preferred_zoneref->zone = - reliable_fb_find_zone(gfp_mask, ac); + reliable_fb_find_zone(gfp_mask, ac); if (!ac->preferred_zoneref->zone) goto nopage; -- Gitee From fde6ae6107f086d2bbfcf52a6abae99ae41b953a Mon 
Sep 17 00:00:00 2001 From: Ma Wupeng Date: Mon, 14 Mar 2022 21:57:01 +0800 Subject: [PATCH 074/340] mm: Count reliable shmem used based on NR_SHMEM hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA ------------------------------------------ With this patch, reliable memory counter will be updated when NR_SHMEM is updated. Pervious shmem reliable memory counter is not accurate if swap is enabled. NR_SHMEM update in memcg secenario is ignored because this has nothing to do with the global counter. If shmem pages is migrated or collapsed from one region to another region, reliable memory counter need to be updated because these pages's reliable status may not be the same. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Laibin Qiu --- mm/filemap.c | 5 ++++- mm/khugepaged.c | 2 ++ mm/migrate.c | 5 +++++ mm/shmem.c | 7 ++----- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index a89d70097e68..9b6e72e14a04 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -291,6 +291,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + shmem_reliable_page_counter(page, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else { @@ -895,8 +896,10 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) */ if (!PageHuge(new)) __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) + if (PageSwapBacked(new)) { __inc_node_page_state(new, NR_SHMEM); + shmem_reliable_page_counter(new, 1); + } xa_unlock_irqrestore(&mapping->i_pages, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 301d6aa079d7..2975fc124cb6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1559,6 +1559,7 @@ static void collapse_shmem(struct mm_struct *mm, ClearPageActive(page); ClearPageUnevictable(page); unlock_page(page); + shmem_reliable_page_counter(page, -1); put_page(page); index++; } @@ -1573,6 +1574,7 @@ static void collapse_shmem(struct mm_struct *mm, mem_cgroup_commit_charge(new_page, memcg, false, true); count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_anon(new_page); + shmem_reliable_page_counter(new_page, 1 << HPAGE_PMD_ORDER); /* * Remove pte page tables, so we can re-fault the page as huge. diff --git a/mm/migrate.c b/mm/migrate.c index f7721d0aece5..ecfa8829acfd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -548,6 +548,11 @@ int migrate_page_move_mapping(struct address_space *mapping, xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ + if (PageSwapBacked(page) && !PageSwapCache(page)) { + shmem_reliable_page_counter(page, -nr); + shmem_reliable_page_counter(newpage, nr); + } + /* * If moved to a different zone then also account * the page for that zone. 
Other VM counters will be diff --git a/mm/shmem.c b/mm/shmem.c index 4363dbc8d57e..8915a5b9ad0a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -733,6 +733,7 @@ static int shmem_add_to_page_cache(struct page *page, __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + shmem_reliable_page_counter(page, nr); xa_unlock_irq(&mapping->i_pages); } else { page->mapping = NULL; @@ -758,6 +759,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); + shmem_reliable_page_counter(page, -1); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -962,8 +964,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, truncate_inode_page(mapping, page); } } - shmem_reliable_page_counter( - page, -(1 << compound_order(page))); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -1074,8 +1074,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, break; } } - shmem_reliable_page_counter( - page, -(1 << compound_order(page))); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -1981,7 +1979,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - shmem_reliable_page_counter(page, 1 << compound_order(page)); alloced = true; if (PageTransHuge(page) && -- Gitee From 46e70fd20c9b8c6b15ff9cb7750b818f44fa52c8 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 14 Mar 2022 22:28:57 +0800 Subject: [PATCH 075/340] lib/iov_iter: initialize "flags" in new pipe_buffer mainline inclusion from mainline-v5.17-rc6 commit 9d2231c5d74e13b2a0546fee6737ee4446017903 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4X1GI?from=project-issue CVE: NA -------------------------------- The functions copy_page_to_iter_pipe() and push_pipe() can both allocate a new pipe_buffer, but the "flags" member initializer is missing. 
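
A user-space sketch of why the missing initializer matters (editorial illustration; the struct below only mimics pipe_buffer): an object carved out of reused memory keeps whatever bits were there before in every field that is not explicitly assigned, so a later reader of the unset "flags" field observes stale values.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_pipe_buffer {
	void *page;
	unsigned int offset;
	unsigned int len;
	unsigned int flags;	/* the field the patch starts initializing */
};

int main(void)
{
	struct fake_pipe_buffer *buf = malloc(sizeof(*buf));

	if (!buf)
		return 1;
	memset(buf, 0xAA, sizeof(*buf));	/* stand-in for reused memory */

	buf->page = NULL;
	buf->offset = 0;
	buf->len = 42;
	/* buf->flags was never assigned and still holds garbage here */
	printf("flags before fix: %#x\n", buf->flags);

	buf->flags = 0;				/* the one-line fix */
	printf("flags after fix:  %#x\n", buf->flags);

	free(buf);
	return 0;
}
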
Fixes: 241699cd72a8 ("new iov_iter flavour: pipe-backed") To: Alexander Viro To: linux-fsdevel@vger.kernel.org To: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Max Kellermann Signed-off-by: Al Viro Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Laibin Qiu --- lib/iov_iter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 25668139fc1f..b80320956caf 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -506,6 +506,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by return 0; pipe->nrbufs++; buf->ops = &page_cache_pipe_buf_ops; + buf->flags = 0; get_page(buf->page = page); buf->offset = offset; buf->len = bytes; @@ -630,6 +631,7 @@ static size_t push_pipe(struct iov_iter *i, size_t size, break; pipe->nrbufs++; pipe->bufs[idx].ops = &default_pipe_buf_ops; + pipe->bufs[idx].flags = 0; pipe->bufs[idx].page = page; pipe->bufs[idx].offset = 0; if (left <= PAGE_SIZE) { -- Gitee From 939559f58fc75629a99eb4003ea3f3ada52dea2f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 15 Mar 2022 10:04:17 +0800 Subject: [PATCH 076/340] crypto: pcrypt - Fix user-after-free on module unload stable inclusion from linux-4.19.102 commit 47ef5cb878817127bd3d54c3578bbbd3f7c2bf2c CVE: NA ------------------------------- [ Upstream commit 07bfd9bdf568a38d9440c607b72342036011f727 ] On module unload of pcrypt we must unregister the crypto algorithms first and then tear down the padata structure. As otherwise the crypto algorithms are still alive and can be used while the padata structure is being freed. Fixes: 5068c7a883d1 ("crypto: pcrypt - Add pcrypt crypto...") Cc: Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Reviewed-by: Wang Weiyang Signed-off-by: Laibin Qiu --- crypto/pcrypt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index 00c7230acfed..4bf4bb5b1a8b 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -509,11 +509,12 @@ static int __init pcrypt_init(void) static void __exit pcrypt_exit(void) { + crypto_unregister_template(&pcrypt_tmpl); + pcrypt_fini_padata(&pencrypt); pcrypt_fini_padata(&pdecrypt); kset_unregister(pcrypt_kset); - crypto_unregister_template(&pcrypt_tmpl); } module_init(pcrypt_init); -- Gitee From 2632c58b0aa3a8b5b0d921961f7360f4510a1d7d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Mar 2022 20:40:44 +0800 Subject: [PATCH 077/340] veth: Do not record rx queue hint in veth_xmit stable inclusion from linux-4.19.226 commit bd6e97e2b6f59a19894c7032a83f03ad38ede28e -------------------------------- commit 710ad98c363a66a0cd8526465426c5c5f8377ee0 upstream. Laurent reported that they have seen a significant amount of TCP retransmissions at high throughput from applications residing in network namespaces talking to the outside world via veths. The drops were seen on the qdisc layer (fq_codel, as per systemd default) of the phys device such as ena or virtio_net due to all traffic hitting a _single_ TX queue _despite_ multi-queue device. (Note that the setup was _not_ using XDP on veths as the issue is generic.) More specifically, after edbea9220251 ("veth: Store queue_mapping independently of XDP prog presence") which made it all the way back to v4.19.184+, skb_record_rx_queue() would set skb->queue_mapping to 1 (given 1 RX and 1 TX queue by default for veths) instead of leaving at 0. 
This is eventually retained and callbacks like ena_select_queue() will also pick single queue via netdev_core_pick_tx()'s ndo_select_queue() once all the traffic is forwarded to that device via upper stack or other means. Similarly, for others not implementing ndo_select_queue() if XPS is disabled, netdev_pick_tx() might call into the skb_tx_hash() and check for prior skb_rx_queue_recorded() as well. In general, it is a _bad_ idea for virtual devices like veth to mess around with queue selection [by default]. Given dev->real_num_tx_queues is by default 1, the skb->queue_mapping was left untouched, and so prior to edbea9220251 the netdev_core_pick_tx() could do its job upon __dev_queue_xmit() on the phys device. Unbreak this and restore prior behavior by removing the skb_record_rx_queue() from veth_xmit() altogether. If the veth peer has an XDP program attached, then it would return the first RX queue index in xdp_md->rx_queue_index (unless configured in non-default manner). However, this is still better than breaking the generic case. Fixes: edbea9220251 ("veth: Store queue_mapping independently of XDP prog presence") Fixes: 638264dc9022 ("veth: Support per queue XDP ring") Reported-by: Laurent Bernaille Signed-off-by: Daniel Borkmann Cc: Maciej Fijalkowski Cc: Toshiaki Makita Cc: Eric Dumazet Cc: Paolo Abeni Cc: John Fastabend Cc: Willem de Bruijn Acked-by: John Fastabend Reviewed-by: Eric Dumazet Acked-by: Toshiaki Makita Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Conflicts: drivers/net/veth.c Signed-off-by: Ziyang Xuan Reviewed-by: Wei Yongjun Signed-off-by: Laibin Qiu --- drivers/net/veth.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 41a00cd76955..749faa6fcd82 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -197,8 +197,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; rcv_xdp = rcu_access_pointer(rq->xdp_prog); - if (rcv_xdp) - skb_record_rx_queue(skb, rxq); } if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { -- Gitee From a9140c0834eacdc16f3f15f0fa8199c2769fdf1e Mon Sep 17 00:00:00 2001 From: Cui GaoSheng Date: Thu, 17 Mar 2022 20:40:45 +0800 Subject: [PATCH 078/340] Revert "audit: bugfix for infinite loop when flush the hold queue" hulk inclusion category: bugfix bugzilla: 186384 https://gitee.com/openeuler/kernel/issues/I4X1AI?from=project-issue CVE: NA -------------------------------- This reverts commit 67ab712f2b0e7448052f3b8fdee727fb7e204448. Signed-off-by: Cui GaoSheng Reviewed-by: Xiu Jianfeng Signed-off-by: Laibin Qiu --- kernel/audit.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 3de5ebb94559..c5e034fe14bb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -740,8 +740,6 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, if (!sk) { if (err_hook) (*err_hook)(skb); - if (queue == &audit_hold_queue) - goto out; continue; } @@ -758,8 +756,6 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, (*err_hook)(skb); if (rc == -EAGAIN) rc = 0; - if (queue == &audit_hold_queue) - goto out; /* continue to drain the queue */ continue; } else @@ -771,7 +767,6 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, } } -out: return (rc >= 0 ? 
0 : rc); } -- Gitee From 1518b80ba8be3561e002dbc91b6583e66b840c67 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 17 Mar 2022 20:40:46 +0800 Subject: [PATCH 079/340] audit: improve audit queue handling when "audit=1" on cmdline mainline inclusion from mainline-v5.17-rc3 commit f26d04331360d42dbd6b58448bd98e4edbfbe1c5 category: bugfix bugzilla: 186384 https://gitee.com/openeuler/kernel/issues/I4X1AI?from=project-issue CVE: NA -------------------------------- When an admin enables audit at early boot via the "audit=1" kernel command line the audit queue behavior is slightly different; the audit subsystem goes to greater lengths to avoid dropping records, which unfortunately can result in problems when the audit daemon is forcibly stopped for an extended period of time. This patch makes a number of changes designed to improve the audit queuing behavior so that leaving the audit daemon in a stopped state for an extended period does not cause a significant impact to the system. - kauditd_send_queue() is now limited to looping through the passed queue only once per call. This not only prevents the function from looping indefinitely when records are returned to the current queue, it also allows any recovery handling in kauditd_thread() to take place when kauditd_send_queue() returns. - Transient netlink send errors seen as -EAGAIN now cause the record to be returned to the retry queue instead of going to the hold queue. The intention of the hold queue is to store, perhaps for an extended period of time, the events which led up to the audit daemon going offline. The retry queue remains a temporary queue intended to protect against transient issues between the kernel and the audit daemon. - The retry queue is now limited by the audit_backlog_limit setting, the same as the other queues. This allows admins to bound the size of all of the audit queues on the system. - kauditd_rehold_skb() now returns records to the end of the hold queue to ensure ordering is preserved in the face of recent changes to kauditd_send_queue(). Cc: stable@vger.kernel.org Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking") Fixes: f4b3ee3c85551 ("audit: improve robustness of the audit queue handling") Reported-by: Gaosheng Cui Tested-by: Gaosheng Cui Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore Signed-off-by: Cui GaoSheng Reviewed-by: Xiu Jianfeng Signed-off-by: Laibin Qiu --- kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index c5e034fe14bb..7dc14a4d9e3c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -549,20 +549,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. 
When this @@ -572,19 +574,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -593,24 +607,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -648,7 +670,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -722,16 +744,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. 
sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -739,7 +763,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, -ECONNREFUSED); continue; } @@ -753,7 +777,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ -- Gitee From ab77435872e7cf1efa7ac84914b87e6f5530c187 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Fri, 18 Mar 2022 18:00:43 +0800 Subject: [PATCH 080/340] blk-mq: add exception handling when srcu->sda alloc failed hulk inclusion category: bugfix bugzilla: 186352, https://gitee.com/openeuler/kernel/issues/I4YADX DTS: DTS2022031707143 CVE: NA -------------------------------- In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect dispatch critical area. But the current process is not aware when memory of srcu allocation failed in blk_mq_alloc_hctx, which will leads to illegal address BUG. Add return value validation to avoid this problem. Signed-off-by: Laibin Qiu Reviewed-by: Hou Tao --- block/blk-mq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0732bcc65f88..9604d2b39745 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2459,12 +2459,16 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, if (!hctx->fq) goto free_bitmap; - if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->srcu); + if (hctx->flags & BLK_MQ_F_BLOCKING) { + if (init_srcu_struct(hctx->srcu) != 0) + goto free_flush_queue; + } blk_mq_hctx_kobj_init(hctx); return hctx; + free_flush_queue: + blk_free_flush_queue(hctx->fq); free_bitmap: sbitmap_free(&hctx->ctx_map); free_ctxs: -- Gitee From c61648a11933cb33878a55ab2610347b7406a6fc Mon Sep 17 00:00:00 2001 From: Mao HongBo Date: Fri, 18 Mar 2022 17:51:31 +0800 Subject: [PATCH 081/340] irqchip/gic-phytium-2500: Fix issue that interrupts are concentrated in one cpu Phytium inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I41AUQ CVE: NA ------------------------------------------------- Fix the issue that interrupts are concentrated in one cpu for Phytium S2500 server. 
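
A toy illustration of the new distribution (editorial sketch; the socket layout below is assumed, not taken from the hardware): adding "cpu % skt_cpu_cnt" to the socket's base CPU spreads successive requests across all CPUs of the target socket instead of letting most of them collapse onto a single CPU.

#include <stdio.h>

int main(void)
{
	int skt_base = 8;	/* first CPU id of the target socket (assumed) */
	int skt_cpu_cnt = 4;	/* CPUs per socket (assumed)                   */

	for (int cpu = 0; cpu < 16; cpu++)
		printf("requested cpu %2d -> target cpu %d\n",
		       cpu, skt_base + cpu % skt_cpu_cnt);
	return 0;
}
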
Signed-off-by: Mao HongBo Signed-off-by: Zheng Zengkai Reviewed-by: Hanjun Guo Signed-off-by: Laibin Qiu --- drivers/irqchip/irq-gic-phytium-2500-its.c | 3 +-- drivers/irqchip/irq-gic-phytium-2500.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-gic-phytium-2500-its.c b/drivers/irqchip/irq-gic-phytium-2500-its.c index fff1c8546d23..dd24af3793ca 100644 --- a/drivers/irqchip/irq-gic-phytium-2500-its.c +++ b/drivers/irqchip/irq-gic-phytium-2500-its.c @@ -1181,8 +1181,7 @@ static int its_cpumask_select(struct its_device *its_dev, } cpu = cpumask_any_and(mask_val, cpu_mask); - if ((cpu > cpus) && (cpu < (cpus + skt_cpu_cnt[skt_id]))) - cpus = cpu; + cpus = cpus + cpu % skt_cpu_cnt[skt_id]; if (is_kdump_kernel()) { skt = (cpu_logical_map(cpu) >> 16) & 0xff; diff --git a/drivers/irqchip/irq-gic-phytium-2500.c b/drivers/irqchip/irq-gic-phytium-2500.c index 8674463a08c6..103e97f5855e 100644 --- a/drivers/irqchip/irq-gic-phytium-2500.c +++ b/drivers/irqchip/irq-gic-phytium-2500.c @@ -1123,8 +1123,7 @@ static int gic_cpumask_select(struct irq_data *d, const struct cpumask *mask_val } cpu = cpumask_any_and(mask_val, cpu_online_mask); - if ((cpu > cpus) && (cpu < (cpus + skt_cpu_cnt[irq_skt]))) - cpus = cpu; + cpus = cpus + cpu % skt_cpu_cnt[irq_skt]; if (is_kdump_kernel()) { skt = (cpu_logical_map(cpu) >> 16) & 0xff; -- Gitee From b89cbe688b97261a4c924a74417f275aa5b25fb9 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Fri, 18 Mar 2022 09:43:49 +0000 Subject: [PATCH 082/340] livepatch/core: Check klp_func before 'klp_init_object_loaded' hulk inclusion category: feature bugzilla: 186346, https://gitee.com/openeuler/kernel/issues/I4WBFN CVE: NA -------------------------------- Refer to following procedure: klp_init_object klp_init_object_loaded klp_find_object_symbol <-- 1. oops happened when old_name is NULL!!! klp_init_func <-- 2. currently old_name is first time check here This problem was introduced in commit 453d38459f94 ("livepatch/arm64: fix func size less than limit") which exchange order of 'klp_init_func' and 'klp_init_object_loaded' then cause old_name being used before check. We move these checks before 'klp_init_object_loaded' and add several logs to tell why check failed. 
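
A stand-alone sketch of the ordering rule being restored (editorial illustration; the struct and length limit are simplified stand-ins for klp_func and KSYM_NAME_LEN): validate old_name and new_func before anything that consumes old_name runs, so a NULL or over-long name is rejected instead of dereferenced.

#include <stdio.h>
#include <string.h>

#define NAME_LEN_MAX 128	/* stand-in for KSYM_NAME_LEN */

struct fake_func {
	const char *old_name;
	void *new_func;
};

static int check_func(const struct fake_func *f)
{
	if (!f->old_name || !f->new_func) {
		fprintf(stderr, "old_name or new_func is invalid\n");
		return -1;
	}
	if (strlen(f->old_name) >= NAME_LEN_MAX) {
		fprintf(stderr, "old_name is too long\n");
		return -1;
	}
	return 0;
}

static void resolve_old_symbol(const struct fake_func *f)
{
	/* would crash on a NULL old_name, so it must run after check_func() */
	printf("resolving %s (%zu chars)\n", f->old_name, strlen(f->old_name));
}

int main(void)
{
	struct fake_func bad = { .old_name = NULL, .new_func = NULL };

	if (check_func(&bad))		/* checks run first ... */
		return 1;
	resolve_old_symbol(&bad);	/* ... so this is never reached */
	return 0;
}
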
Fixes: 453d38459f94 ("livepatch/arm64: fix func size less than limit") Signed-off-by: Zheng Yejian Reviewed-by: Cheng Jian Signed-off-by: Yongqiang Liu --- kernel/livepatch/core.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 66a7c4befa0e..89b31e827425 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1010,12 +1010,6 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) int ret; #endif - if (!func->old_name || !func->new_func) - return -EINVAL; - - if (strlen(func->old_name) >= KSYM_NAME_LEN) - return -EINVAL; - #ifdef CONFIG_LIVEPATCH_WO_FTRACE ret = arch_klp_func_can_patch(func); if (ret) @@ -1105,6 +1099,16 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) return -EINVAL; + klp_for_each_func(obj, func) { + if (!func->old_name || !func->new_func) { + pr_err("old_name or new_func is invalid\n"); + return -EINVAL; + } + if (strlen(func->old_name) >= KSYM_NAME_LEN) { + pr_err("old_name is too long\n"); + return -EINVAL; + } + } obj->patched = false; obj->mod = NULL; -- Gitee From f91c9577095c3595db6485e6362833dcd8e0e54e Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 21 Mar 2022 08:00:20 +0000 Subject: [PATCH 083/340] ext4: Fix symlink file size not match to file content hulk inclusion category: bugfix bugzilla: 186450, https://gitee.com/openeuler/kernel/issues/I4YSJ7 CVE: NA ----------------------------------------------- We got issue as follows: [root@yebin home]# fsck.ext4 -fn ram0yb e2fsck 1.45.6 (20-Mar-2020) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Symlink /p3/d14/d1a/l3d (inode #3494) is invalid. Clear? no Entry 'l3d' in /p3/d14/d1a (3383) has an incorrect filetype (was 7, should be 0). Fix? no As symlink file size not match to file content. If symlink data block writback failed, will call ext4_finish_bio to end io. In this path don't mark buffer error. When umount do checkpoint can't detect buffer error, then will cleanup jounral. Actually, correct data maybe in journal area. To solve this issue, mark buffer error when detect bio error in ext4_finish_bio. Signed-off-by: Ye Bin Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/ext4/page-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9cc79b7b0df1..3de933354a08 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -105,8 +105,10 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_status) + if (bio->bi_status) { + set_buffer_write_io_error(bh); buffer_io_error(bh); + } } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); -- Gitee From 06ff79d54e9f2dc74706976176eced61e31a5b54 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 21 Mar 2022 08:00:21 +0000 Subject: [PATCH 084/340] blk-throtl: fix race in io dispatching hulk inclusion category: bugfix bugzilla: 186449, https://gitee.com/openeuler/kernel/issues/I4YSPC CVE: NA -------------------------------- If io is throttled, such io will be issued by blk_throtl_dispatch_work_fn() or blk_throtl_drain(), and the io is fetched by throtl_pop_queued(). throtl_pop_queued() should be protected by 'queue_lock', as what blk_throtl_dispatch_work_fn() does. 
However, it's not protected in blk_throtl_drain(), which may lead to concurrent bio_list_pop(), and may end up crashing the kernel. Fix the problem by protecting throtl_pop_queued() through 'queue_lock' in blk_throtl_drain(). Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Yongqiang Liu --- block/blk-throttle.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1e6d5631669b..95d28cf8ff0f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2422,8 +2422,11 @@ void blk_throtl_drain(struct request_queue *q) struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; struct bio *bio; + struct bio_list bio_list_on_stack; int rw; + bio_list_init(&bio_list_on_stack); + queue_lockdep_assert_held(q); rcu_read_lock(); @@ -2440,12 +2443,16 @@ void blk_throtl_drain(struct request_queue *q) tg_drain_bios(&td->service_queue); rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); /* all bios now should be in td->service_queue, issue them */ for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], NULL))) + bio_list_add(&bio_list_on_stack, bio); + spin_unlock_irq(q->queue_lock); + + if (!bio_list_empty(&bio_list_on_stack)) + while ((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); spin_lock_irq(q->queue_lock); -- Gitee From ee8a1d431a68787cefd21ab451ddc8b62b95d43c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 Mar 2022 09:37:30 +0000 Subject: [PATCH 085/340] fuse: fix pipe buffer lifetime for direct_io mainline inclusion from mainline-v5.17-rc8 commit 0c4bcfdecb1ac0967619ee7ff44871d93c08c909 category: bugfix bugzilla: 186448, https://gitee.com/openeuler/kernel/issues/I4YORE CVE: CVE-2022-1011 -------------------------------- In FOPEN_DIRECT_IO mode, fuse_file_write_iter() calls fuse_direct_write_iter(), which normally calls fuse_direct_io(), which then imports the write buffer with fuse_get_user_pages(), which uses iov_iter_get_pages() to grab references to userspace pages instead of actually copying memory. On the filesystem device side, these pages can then either be read to userspace (via fuse_dev_read()), or splice()d over into a pipe using fuse_dev_splice_read() as pipe buffers with &nosteal_pipe_buf_ops. This is wrong because after fuse_dev_do_read() unlocks the FUSE request, the userspace filesystem can mark the request as completed, causing write() to return. At that point, the userspace filesystem should no longer have access to the pipe buffer. Fix by copying pages coming from the user address space to new pipe buffers. Reported-by: Jann Horn Fixes: c3021629a0d8 ("fuse: support splice() reading from fuse device") Cc: Signed-off-by: Miklos Szeredi Conflicts: fs/fuse/file.c fs/fuse/fuse_i.h Signed-off-by: Zhang Wensheng Reviewed-by: Tao Hou Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- fs/fuse/dev.c | 12 +++++++++++- fs/fuse/file.c | 6 ++++-- fs/fuse/fuse_i.h | 3 +++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8a98ffd5e4b6..bd5293e01e48 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -999,7 +999,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. 
+ */ + if (cs->req->in.user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ecabacabd47..8d238d965db1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1321,10 +1321,12 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } - if (write) + if (write) { + req->in.user_pages = 1; req->in.argpages = 1; - else + } else { req->out.argpages = 1; + } *nbytesp = nbytes; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 853d37ec81e0..27f3b6d4f11c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -174,6 +174,9 @@ struct fuse_in { /** True if the data for the last argument is in req->pages */ unsigned argpages:1; + /** True if direct write */ + unsigned user_pages:1; + /** Number of arguments */ unsigned numargs; -- Gitee From 108567834dad53630b3684734a60195fe1462bda Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Mon, 21 Mar 2022 09:37:31 +0000 Subject: [PATCH 086/340] kabi: fix kabi broken in struct fuse_in mainline inclusion from mainline-v5.17-rc8 commit 0c4bcfdecb1ac0967619ee7ff44871d93c08c909 category: bugfix bugzilla: 186448, https://gitee.com/openeuler/kernel/issues/I4YORE CVE: CVE-2022-1011 -------------------------------- Because create a new user_pages in fuse_in, to fix kabi change. Signed-off-by: Zhang Wensheng Reviewed-by: Tao Hou Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- fs/fuse/fuse_i.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 27f3b6d4f11c..0981011c10c6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -174,8 +174,10 @@ struct fuse_in { /** True if the data for the last argument is in req->pages */ unsigned argpages:1; +#ifndef __GENKSYMS__ /** True if direct write */ unsigned user_pages:1; +#endif /** Number of arguments */ unsigned numargs; -- Gitee From 1b70444ce952d714d51fded4ec1e162428fa539f Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Tue, 22 Mar 2022 13:53:07 +0000 Subject: [PATCH 087/340] ACPI/IORT: Do not blindly trust DMA masks from firmware mainline inclusion from mainline-v5.11-rc6 commit a1df829ead5877d4a1061e976a50e2e665a16f24 category: bugfix bugzilla: https://gitee.com/openeuler/qemu/issues/I4WE3Y CVE: NA -------------------------------- Address issue observed on real world system with suboptimal IORT table where DMA masks of PCI devices would get set to 0 as result. iort_dma_setup() would query the root complex'/named component IORT entry for a DMA mask, and use that over the one the device has been configured with earlier. Ideally we want to use the minimum mask of what the IORT contains for the root complex and what the device was configured with. 
Fixes: 5ac65e8c8941 ("ACPI/IORT: Support address size limit for root complexes") Signed-off-by: Moritz Fischer Reviewed-by: Robin Murphy Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20210122012419.95010-1-mdf@kernel.org Signed-off-by: Catalin Marinas Conflicts: drivers/acpi/arm64/iort.c Signed-off-by: Xiongfeng Wang Reviewed-by: Hanjun Guo Signed-off-by: Yongqiang Liu --- drivers/acpi/arm64/iort.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index bd35cfe6776d..320db806994b 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -997,6 +997,11 @@ static int nc_dma_get_range(struct device *dev, u64 *size) ncomp = (struct acpi_iort_named_component *)node->node_data; + if (!ncomp->memory_address_limit) { + pr_warn(FW_BUG "Named component missing memory address limit\n"); + return -EINVAL; + } + *size = ncomp->memory_address_limit >= 64 ? U64_MAX : 1ULL<memory_address_limit; @@ -1016,6 +1021,11 @@ static int rc_dma_get_range(struct device *dev, u64 *size) rc = (struct acpi_iort_root_complex *)node->node_data; + if (!rc->memory_address_limit) { + pr_warn(FW_BUG "Root complex missing memory address limit\n"); + return -EINVAL; + } + *size = rc->memory_address_limit >= 64 ? U64_MAX : 1ULL<memory_address_limit; @@ -1072,8 +1082,8 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) * retrieved from firmware. */ dev->bus_dma_mask = mask; - dev->coherent_dma_mask = mask; - *dev->dma_mask = mask; + dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask); + *dev->dma_mask = min(*dev->dma_mask, mask); } *dma_addr = dmaaddr; -- Gitee From afea170045a0cb86eb59d5871ff96635e35c8a22 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 22 Mar 2022 13:53:08 +0000 Subject: [PATCH 088/340] xfs: fix use after free in buf log item unlock assert mainline inclusion from mainline-v5.1-rc5 commit 4d09807f20462d6edf04f6e98d3d47bcdf7a5e2f category: bugfix bugzilla: 186464, https://gitee.com/openeuler/kernel/issues/I4YYIZ -------------------------------- The xfs_buf_log_item ->iop_unlock() callback asserts that the buffer is unlocked when either non-stale or aborted. This assert occurs after the bli refcount has been dropped and the log item potentially freed. The aborted check is thus a potential use after free. This problem has been reproduced with KASAN enabled via generic/475. Fix up xfs_buf_item_unlock() to query aborted state before the bli reference is dropped to prevent a potential use after free. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
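
A small stand-alone sketch of the clamping logic (editorial illustration; the mask values and helper names are made up, not the IORT code): derive a mask from the firmware-reported address limit, treat a zero limit as invalid, and otherwise combine it with the device's existing mask by taking the minimum rather than overwriting it.

#include <stdint.h>
#include <stdio.h>

static uint64_t mask_from_limit(unsigned int limit_bits)
{
	if (limit_bits == 0)		/* buggy table: no limit reported */
		return 0;		/* caller must treat this as invalid */
	return limit_bits >= 64 ? UINT64_MAX : (1ULL << limit_bits) - 1;
}

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	uint64_t dev_mask = (1ULL << 32) - 1;		/* device's current mask */
	uint64_t fw_mask  = mask_from_limit(40);	/* healthy IORT entry    */
	uint64_t zero     = mask_from_limit(0);		/* broken IORT entry     */

	printf("combined mask      : %#llx\n",
	       (unsigned long long)min_u64(dev_mask, fw_mask));
	printf("blindly trusting 0 : %#llx (DMA mask would collapse to zero)\n",
	       (unsigned long long)zero);
	return 0;
}
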
Wong Signed-off-by: Guo Xuenan Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/xfs/xfs_buf_item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 4309cd07331a..9b37a362de35 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -605,6 +605,8 @@ xfs_buf_item_unlock( #if defined(DEBUG) || defined(XFS_WARN) bool ordered = bip->bli_flags & XFS_BLI_ORDERED; bool dirty = bip->bli_flags & XFS_BLI_DIRTY; + bool aborted = test_bit(XFS_LI_ABORTED, + &lip->li_flags); #endif trace_xfs_buf_item_unlock(bip); @@ -633,7 +635,7 @@ xfs_buf_item_unlock( released = xfs_buf_item_put(bip); if (hold || (stale && !released)) return; - ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + ASSERT(!stale || aborted); xfs_buf_relse(bp); } -- Gitee From a4e359845552c638c790807d4888fd5b61ad9c7a Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 22 Mar 2022 13:53:09 +0000 Subject: [PATCH 089/340] xfs: Fix possible null-pointer dereferences in xchk_da_btree_block_check_sibling() mainline inclusion from mainline-v5.3-rc2 commit afa1d96d1430c2138c545fb76e6dcb21222098d4 category: bugfix bugzilla: 186464, https://gitee.com/openeuler/kernel/issues/I4YYIZ -------------------------------- In xchk_da_btree_block_check_sibling(), there is an if statement on line 274 to check whether ds->state->altpath.blk[level].bp is NULL: if (ds->state->altpath.blk[level].bp) When ds->state->altpath.blk[level].bp is NULL, it is used on line 281: xfs_trans_brelse(..., ds->state->altpath.blk[level].bp); struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); Thus, possible null-pointer dereferences may occur. To fix these bugs, ds->state->altpath.blk[level].bp is checked before being used. These bugs are found by a static analysis tool STCheck written by us. Signed-off-by: Jia-Ju Bai Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Guo Xuenan Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/xfs/scrub/dabtree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index f1260b4bfdee..17b6dde71bfe 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -286,7 +286,11 @@ xchk_da_btree_block_check_sibling( /* Compare upper level pointer to sibling pointer. */ if (ds->state->altpath.blk[level].blkno != sibling) xchk_da_set_corrupt(ds, level); - xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp); + if (ds->state->altpath.blk[level].bp) { + xfs_trans_brelse(ds->dargs.trans, + ds->state->altpath.blk[level].bp); + ds->state->altpath.blk[level].bp = NULL; + } out: return error; } -- Gitee From 76f51e9eaec87bf30d59e3dbf85785396f199944 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 22 Mar 2022 13:53:10 +0000 Subject: [PATCH 090/340] xfs: fix an undefined behaviour in _da3_path_shift mainline inclusion from mainline-v5.6-rc4 commit 4982bff1ace1196843f55536fcd4cc119738fe39 category: bugfix bugzilla: 186464, https://gitee.com/openeuler/kernel/issues/I4YYIZ -------------------------------- In xfs_da3_path_shift() "blk" can be assigned to state->path.blk[-1] if state->path.active is 1 (which is a valid state) when it tries to add an entry to a single dir leaf block and then to shift forward to see if there's a sibling block that would be a better place to put the new entry. This causes a UBSAN warning given negative array indices are undefined behavior in C. 
In practice the warning is entirely harmless given that "blk" is never dereferenced in this case, but it is still better to fix up the warning and slightly improve the code. UBSAN: Undefined behaviour in fs/xfs/libxfs/xfs_da_btree.c:1989:14 index -1 is out of range for type 'xfs_da_state_blk_t [5]' Call trace: dump_backtrace+0x0/0x2c8 show_stack+0x20/0x2c dump_stack+0xe8/0x150 __ubsan_handle_out_of_bounds+0xe4/0xfc xfs_da3_path_shift+0x860/0x86c [xfs] xfs_da3_node_lookup_int+0x7c8/0x934 [xfs] xfs_dir2_node_addname+0x2c8/0xcd0 [xfs] xfs_dir_createname+0x348/0x38c [xfs] xfs_create+0x6b0/0x8b4 [xfs] xfs_generic_create+0x12c/0x1f8 [xfs] xfs_vn_mknod+0x3c/0x4c [xfs] xfs_vn_create+0x34/0x44 [xfs] do_last+0xd4c/0x10c8 path_openat+0xbc/0x2f4 do_filp_open+0x74/0xf4 do_sys_openat2+0x98/0x180 __arm64_sys_openat+0xf8/0x170 do_el0_svc+0x170/0x240 el0_sync_handler+0x150/0x250 el0_sync+0x164/0x180 Suggested-by: Christoph Hellwig Signed-off-by: Qian Cai Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Guo Xuenan Conflicts: fs/xfs/libxfs/xfs_da_btree.c Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/xfs/libxfs/xfs_da_btree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 376bee94b5dd..2f0c9610dabc 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -1882,7 +1882,8 @@ xfs_da3_path_shift( ASSERT(path != NULL); ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ - for (blk = &path->blk[level]; level >= 0; blk--, level--) { + for (; level >= 0; level--) { + blk = &path->blk[level]; node = blk->bp->b_addr; dp->d_ops->node_hdr_from_disk(&nodehdr, node); btree = dp->d_ops->node_tree_p(node); -- Gitee From 0213acd04df8410a630af3f97489117ce9c11c61 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Tue, 22 Mar 2022 13:53:11 +0000 Subject: [PATCH 091/340] io_uring: fix UAF in get_files_struct() hulk inclusion category: bugfix bugzilla: 186337, https://gitee.com/openeuler/kernel/issues/I4XA09 CVE: NA -------------------------------- If two tasks are running concurrently as follows: task1 | task2 io_uring_enter | io_wqe_worker io_submit_sqes | io_submit_sqe | io_queue_sqe | io_req_defer | io_req_defer_prep | io_prep_work_files | io_grab_files | req->work.files = current->files | io_queue_async_work | __io_queue_async_work | io_wq_enqueue | io_wqe_insert_work | | io_worker_handle_work | io_impersonate_work | current->files = work->files And then, one of the concurrency UAF can be shown as below: free use (task3 ls -l /proc/io_wqe_worker id/fd ) do_exit // tsk = current = work->files | exit_files | put_files_struct | tsk->files // tsk->files = work->files | | iterate_dir | proc_readfd_common | p = get_proc_task(file_inode(file)) | get_files_struct | files = task->files | atomic_inc(&files->count) The root cause of UAF bugs is when get req->work.files doesn't add refcount. The mainline commit 0f2122045b94(io_uring: don't rely on weak ->files references) fixes this problem, based on this commit to resolved the problme. 
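
Before the UBSAN report quoted next, a stand-alone sketch of the C rule involved (editorial illustration; the types are simplified): computing &array[-1] is undefined behaviour even if the pointer is never dereferenced, and the rewritten loop avoids it by taking the element address only for a non-negative index.

#include <stdio.h>

#define MAXDEPTH 5

struct blk {
	int blkno;
};

int main(void)
{
	struct blk path[MAXDEPTH] = { {10}, {11}, {12}, {13}, {14} };
	int level = -1;		/* what (active - 1) - 1 yields when active == 1 */

	/*
	 * Old shape:  for (blk = &path[level]; level >= 0; blk--, level--)
	 * With level == -1 the initializer already forms &path[-1], an
	 * out-of-range pointer, which is undefined behaviour even though
	 * it is never dereferenced.
	 *
	 * Patched shape: only index the array once level is known to be
	 * non-negative, so no such pointer is ever created.
	 */
	for (; level >= 0; level--) {
		struct blk *blk = &path[level];

		printf("visiting block %d\n", blk->blkno);
	}
	printf("done without forming an out-of-range pointer\n");
	return 0;
}
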
Signed-off-by: Luo Meng Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7240b5423170..a21d1d67d725 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5613,6 +5613,7 @@ static void __io_clean_op(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); req->flags &= ~REQ_F_INFLIGHT; + put_files_struct(req->work.files); } } @@ -5990,7 +5991,7 @@ static int io_grab_files(struct io_kiocb *req) if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; - req->work.files = current->files; + req->work.files = get_files_struct(current); ret = 0; } spin_unlock_irq(&ctx->inflight_lock); -- Gitee From b630f785e12895877121724de737020fff8f0f9e Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Tue, 22 Mar 2022 13:53:12 +0000 Subject: [PATCH 092/340] sock: remove one redundant SKB_FRAG_PAGE_ORDER macro mainline inclusion from mainline-v5.15-rc1 commit 723783d077e39c256a1fafebbd97cbb14207c28f category: bugfix bugzilla: 186409, https://gitee.com/openeuler/kernel/issues/I4Z0V2 CVE: CVE-2022-0886 -------------------------------- Both SKB_FRAG_PAGE_ORDER are defined to the same value in net/core/sock.c and drivers/vhost/net.c. Move the SKB_FRAG_PAGE_ORDER definition to net/core/sock.h, as both net/core/sock.c and drivers/vhost/net.c include it, and it seems a reasonable file to put the macro. Signed-off-by: Yunsheng Lin Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Xu Jia conflict: drivers/vhost/net.c Reviewed-by: Wei Yongjun Reviewed-by: Wang Weiyang Signed-off-by: Yongqiang Liu --- include/net/sock.h | 2 ++ net/core/sock.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 803464e66e02..b001ddbed3f3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2584,6 +2584,8 @@ extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +#define SKB_FRAG_PAGE_ORDER get_order(32768) + static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ diff --git a/net/core/sock.c b/net/core/sock.c index 66a21528c739..246abac5e8d2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2229,9 +2229,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -/* On 32bit arches, an skb frag is limited to 2^15 */ -#define SKB_FRAG_PAGE_ORDER get_order(32768) - /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get -- Gitee From 019f9120dc4f9e5598519eb65f824d8a588c6dd8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 22 Mar 2022 13:53:13 +0000 Subject: [PATCH 093/340] esp: Fix possible buffer overflow in ESP transformation mainline inclusion from mainline commit ebe48d368e97d007bfeb76fcb065d6cfc4c96645 category: bugfix bugzilla: 186409, https://gitee.com/openeuler/kernel/issues/I4Z0V2 CVE: CVE-2022-0886 -------------------------------- The maximum message size that can be send is bigger than the maximum site that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get get_order() costs as suggested by Linus Torvalds. 
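For a sense of the sizes involved, the arithmetic below is illustrative and assumes 4 KiB pages (the macros themselves are introduced in the hunks further down):

	/* SKB_FRAG_PAGE_ORDER  = get_order(32768)                 = 3
	 * ESP_SKB_FRAG_MAXSIZE = PAGE_SIZE << SKB_FRAG_PAGE_ORDER = 32768 bytes
	 *
	 * Once ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) exceeds 32 KiB,
	 * the page-fragment path cannot hold the whole payload, so the
	 * output path must take the COW branch instead of appending frags.
	 */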
Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert Signed-off-by: Xu Jia Reviewed-by: Wei Yongjun Reviewed-by: Wang Weiyang Signed-off-by: Yongqiang Liu --- include/net/esp.h | 2 ++ net/ipv4/esp4.c | 5 +++++ net/ipv6/esp6.c | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/include/net/esp.h b/include/net/esp.h index 117652eb6ea3..465e38890ee9 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 0792a9e2a555..45bb5b60b994 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -275,6 +275,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -284,6 +285,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 25317d5ccf2c..e3abfc97fde7 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -241,6 +241,11 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; + + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { -- Gitee From 04c20fc89eb4c7b9ce6ecc5695538f4ef3b45ac2 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Wed, 23 Mar 2022 11:38:31 +0000 Subject: [PATCH 094/340] swiotlb: fix info leak with DMA_FROM_DEVICE mainline inclusion from mainline-v5.17-rc6 commit ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e category: bugfix bugzilla: 186478, https://gitee.com/openeuler/kernel/issues/I4Z86P CVE: CVE-2022-0854 -------------------------------- The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer. As if the device was to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if the we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics). This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guest like s390 Secure Execution, or AMD SEV). 
4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is an swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. Signed-off-by: Halil Pasic Signed-off-by: Christoph Hellwig Conflicts: Documentation/core-api/dma-attributes.rst include/linux/dma-mapping.h kernel/dma/swiotlb.c Signed-off-by: Liu Shixin Reviewed-by: Xiu Jianfeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- Documentation/DMA-attributes.txt | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 8f8d97f65d73..34742fc00ec0 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -156,3 +156,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 9d39d97e09f4..aa23cade6991 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -70,6 +70,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. * It can be given to a device to use as a DMA source or target. 
A CPU cannot diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2a8c41f12d45..1624d12b60af 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -584,7 +584,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); return tlb_addr; -- Gitee From 3f80e18671eff79521b2eab06526446c40bf71bf Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Wed, 23 Mar 2022 11:38:32 +0000 Subject: [PATCH 095/340] swiotlb: rework "fix info leak with DMA_FROM_DEVICE" mainline inclusion from mainline-v5.17-rc8 commit aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13 category: bugfix bugzilla: 186478, https://gitee.com/openeuler/kernel/issues/I4Z86P CVE: CVE-2022-0854 -------------------------------- Unfortunately, we ended up merging an old version of the patch "fix info leak with DMA_FROM_DEVICE" instead of merging the latest one. Christoph (the swiotlb maintainer), he asked me to create an incremental fix (after I have pointed this out the mix up, and asked him for guidance). So here we go. The main differences between what we got and what was agreed are: * swiotlb_sync_single_for_device is also required to do an extra bounce * We decided not to introduce DMA_ATTR_OVERWRITE until we have exploiters * The implantation of DMA_ATTR_OVERWRITE is flawed: DMA_ATTR_OVERWRITE must take precedence over DMA_ATTR_SKIP_CPU_SYNC Thus this patch removes DMA_ATTR_OVERWRITE, and makes swiotlb_sync_single_for_device() bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale data from the swiotlb buffer. Let me note, that if the size used with dma_sync_* API is less than the size used with dma_[un]map_*, under certain circumstances we may still end up with swiotlb not being transparent. In that sense, this is no perfect fix either. To get this bullet proof, we would have to bounce the entire mapping/bounce buffer. For that we would have to figure out the starting address, and the size of the mapping in swiotlb_sync_single_for_device(). While this does seem possible, there seems to be no firm consensus on how things are supposed to work. Signed-off-by: Halil Pasic Fixes: ddbd89deb7d3 ("swiotlb: fix info leak with DMA_FROM_DEVICE") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Linus Torvalds Conflicts: Documentation/core-api/dma-attributes.rst include/linux/dma-mapping.h kernel/dma/swiotlb.c Signed-off-by: Liu Shixin Reviewed-by: Xiu Jianfeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- Documentation/DMA-attributes.txt | 8 -------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 26 ++++++++++++++++---------- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 34742fc00ec0..8f8d97f65d73 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -156,11 +156,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). 
- -DMA_ATTR_OVERWRITE ------------------- - -This is a hint to the DMA-mapping subsystem that the device is expected to -overwrite the entire mapped size, thus the caller does not require any of the -previous buffer contents to be preserved. This allows bounce-buffering -implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index aa23cade6991..9d39d97e09f4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -70,14 +70,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. * It can be given to a device to use as a DMA source or target. A CPU cannot diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1624d12b60af..b6eee8ea867b 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -583,11 +583,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, */ for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); - + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); return tlb_addr; } @@ -679,11 +682,14 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, BUG_ON(dir != DMA_TO_DEVICE); break; case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); + /* + * Unconditional bounce is necessary to avoid corruption on + * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite + * the whole lengt of the bounce buffer. + */ + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_TO_DEVICE); + BUG_ON(!valid_dma_direction(dir)); break; default: BUG(); -- Gitee From c7c20ad04c0ff233149e7c5108344bfa19c88a3a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 24 Mar 2022 04:19:04 +0000 Subject: [PATCH 096/340] hugetlb: Add huge page alloced limit hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4YTLN CVE: NA -------------------------------- The user wants to reserve a certain amount of memory for normal non-huge page, that is, the hugetlb can't allowed to use all the memory. Add a new kernel parameters "hugepage_prohibit_sz=" to set size for normal non-huge page reserved, and when alloc huge page, let's fail if the new allocating exceeds the limit. 
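As a usage sketch (the sizes here are made up for illustration, not taken from the patch), a boot command line that keeps 4 GiB for normal pages while still requesting boot-time huge pages could look like:

    hugepage_prohibit_sz=4G hugepagesz=2M hugepages=2048

With such a setting, huge page allocation is refused as soon as it would leave less than 4 GiB of usable normal memory, rather than consuming everything that is left.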
Signed-off-by: Kefeng Wang Signed-off-by: Peng Liu Reviewed-by: Chen Wandun Signed-off-by: Yongqiang Liu --- .../admin-guide/kernel-parameters.txt | 7 ++ mm/hugetlb.c | 73 +++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2cb6c844d751..170e25b192a1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1405,6 +1405,13 @@ hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET registers. Default set by CONFIG_HPET_MMAP_DEFAULT. + hugepage_prohibit_sz= + [HW] HugeTLB pages should not alloc when the rest of + the normal pages less than hugepage_prohibit_sz. This + setting is to make sure a system can start even when + part of physical memory is broken, admin users can + adjust this according to typical environment. + hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. If using node format, the number of pages to allocate per-node can be specified. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 68cec97bbd06..5b6a050fd6c1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1444,6 +1444,33 @@ pgoff_t hugetlb_basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } +#define HUGE_PAGE_BOOTMEM_ALLOC 0 +#define HUGE_PAGE_FRESH_ALLOC 1 + +static u64 normal_page_reserve_sz; + +static int __init early_normal_page_reserve(char *p) +{ + unsigned long long size; + + if (!p) + return 1; + + size = memparse(p, &p); + if (*p) { + pr_warn("HugeTLB: Invalid normal page reserved size\n"); + return 1; + } + + normal_page_reserve_sz = size & PAGE_MASK; + + pr_info("HugeTLB: Normal page reserved %lldMB\n", + normal_page_reserve_sz >> 20); + + return 0; +} +early_param("hugepage_prohibit_sz", early_normal_page_reserve); + static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -1491,6 +1518,45 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, return page; } +static bool __ref huge_page_limit_check(int type, size_t hsize, int nid) +{ + u64 mem_usable = 0; + char *str = NULL; + char buf[32]; + + if (!normal_page_reserve_sz) + return true; + + if (system_state > SYSTEM_SCHEDULING) + return true; + + if (normal_page_reserve_sz >= memblock_phys_mem_size()) { + mem_usable = memblock_phys_mem_size(); + str = "physical memory"; + goto out; + } + + if (type == HUGE_PAGE_BOOTMEM_ALLOC) { + mem_usable = memblock_phys_mem_size() - memblock_reserved_size(); + str = "memblock usable"; + } else if (type == HUGE_PAGE_FRESH_ALLOC) { + mem_usable = nr_free_pages() << PAGE_SHIFT; + str = "free page"; + } + + if (mem_usable < normal_page_reserve_sz + hsize) + goto out; + + return true; +out: + string_get_size(hsize, 1, STRING_UNITS_2, buf, 32); + pr_info("HugeTLB: allocating(%s) + Normal pages reserved(%lldMB) node%d exceed %s size(%lldMB)\n", + buf, normal_page_reserve_sz >> 20, + nid, str, mem_usable >> 20); + + return false; +} + /* * Common helper to allocate a fresh hugetlb page. 
All specific allocators * should use this function to get new hugetlb pages @@ -1501,6 +1567,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, { struct page *page; + if (!huge_page_limit_check(HUGE_PAGE_FRESH_ALLOC, huge_page_size(h), nid)) + return NULL; + if (hstate_is_gigantic(h)) page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else @@ -2251,6 +2320,10 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) if (nid != NUMA_NO_NODE && nid >= nr_online_nodes) return 0; + + if (!huge_page_limit_check(HUGE_PAGE_BOOTMEM_ALLOC, huge_page_size(h), nid)) + return 0; + /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = memblock_virt_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), -- Gitee From b253ac1c874e3fa28ffbd41a724420a784c9062d Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Thu, 24 Mar 2022 04:19:05 +0000 Subject: [PATCH 097/340] block-map: add __GFP_ZERO flag for alloc_page in function bio_copy_kern mainline inclusion from mainline-v5.17-rc5 commit cc8f7fe1f5eab010191aa4570f27641876fa1267 category: bugfix bugzilla: 186474, https://gitee.com/openeuler/kernel/issues/I4Z2LA CVE: CVE-2022-0494 -------------------------------- Add __GFP_ZERO flag for alloc_page in function bio_copy_kern to initialize the buffer of a bio. Signed-off-by: Haimin Zhang Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220216084038.15635-1-tcs.kernel@gmail.com Signed-off-by: Jens Axboe Conflict: commit ce288e053568 ("block: remove BLK_BOUNCE_ISA support") is not backported. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index b50d3b59c79b..6457cbfa70cc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1528,7 +1528,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(q->bounce_gfp | __GFP_ZERO | gfp_mask); if (!page) goto cleanup; -- Gitee From 11e2bfddce4a40a3c996f1eef7e563c03b16839c Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Sat, 26 Mar 2022 09:38:38 +0800 Subject: [PATCH 098/340] arm64/mpam: fix __mpam_device_create() section mismatch error ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I49RB2 CVE: NA --------------------------------------------------- Fix modpost Section mismatch error in __mpam_device_create() and others. These warnings will occur in high version gcc, for example 10.1.0. [...] WARNING: vmlinux.o(.text+0x2ed88): Section mismatch in reference from the function __mpam_device_create() to the function .init.text:mpam_device_alloc() The function __mpam_device_create() references the function __init mpam_device_alloc(). This is often because __mpam_device_create lacks a __init annotation or the annotation of mpam_device_alloc is wrong. WARNING: vmlinux.o(.text.unlikely+0xa5c): Section mismatch in reference from the function mpam_resctrl_init() to the function .init.text:mpam_init_padding() The function mpam_resctrl_init() references the function __init mpam_init_padding(). This is often because mpam_resctrl_init lacks a __init annotation or the annotation of mpam_init_padding is wrong. 
WARNING: vmlinux.o(.text.unlikely+0x5a9c): Section mismatch in reference from the function resctrl_group_init() to the function .init.text:resctrl_group_setup_root() The function resctrl_group_init() references the function __init resctrl_group_setup_root(). This is often because resctrl_group_init lacks a __init annotation or the annotation of resctrl_group_setup_root is wrong. [...] Fixes: c5e27c395a70 ("arm64/mpam: remove __init macro to support driver probe") Signed-off-by: Xingang Wang Signed-off-by: Wang ShaoBo Reviewed-by: Cheng Jian Signed-off-by: Laibin Qiu --- arch/arm64/kernel/mpam/mpam_device.c | 8 ++++---- arch/arm64/kernel/mpam/mpam_resctrl.c | 2 +- fs/resctrlfs.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c index 4e882e81cf41..5d34751f8c3e 100644 --- a/arch/arm64/kernel/mpam/mpam_device.c +++ b/arch/arm64/kernel/mpam/mpam_device.c @@ -621,7 +621,7 @@ static void mpam_failed(struct work_struct *work) mutex_unlock(&mpam_cpuhp_lock); } -static struct mpam_device * __init +static struct mpam_device * mpam_device_alloc(struct mpam_component *comp) { struct mpam_device *dev; @@ -656,7 +656,7 @@ static void mpam_devices_destroy(struct mpam_component *comp) } } -static struct mpam_component * __init mpam_component_alloc(int id) +static struct mpam_component *mpam_component_alloc(int id) { struct mpam_component *comp; @@ -694,7 +694,7 @@ struct mpam_component *mpam_component_get(struct mpam_class *class, int id, return comp; } -static struct mpam_class * __init mpam_class_alloc(u8 level_idx, +static struct mpam_class *mpam_class_alloc(u8 level_idx, enum mpam_class_types type) { struct mpam_class *class; @@ -733,7 +733,7 @@ static void mpam_class_destroy(struct mpam_class *class) } } -static struct mpam_class * __init mpam_class_get(u8 level_idx, +static struct mpam_class *mpam_class_get(u8 level_idx, enum mpam_class_types type, bool alloc) { diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index fe2dcf92100f..183c83d4274f 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -1135,7 +1135,7 @@ void closid_free(int closid) * Choose a width for the resource name and resource data based on the * resource that has widest name and cbm. */ -static __init void mpam_init_padding(void) +static void mpam_init_padding(void) { int cl; struct mpam_resctrl_res *res; diff --git a/fs/resctrlfs.c b/fs/resctrlfs.c index ea9df7d77b95..0e6753012140 100644 --- a/fs/resctrlfs.c +++ b/fs/resctrlfs.c @@ -989,7 +989,7 @@ static void resctrl_group_default_init(struct resctrl_group *r) r->type = RDTCTRL_GROUP; } -static int __init resctrl_group_setup_root(void) +static int resctrl_group_setup_root(void) { int ret; -- Gitee From 33dea90eb1047c6648ca441e15312996efd6e3fe Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Sat, 26 Mar 2022 09:38:39 +0800 Subject: [PATCH 099/340] arm64/mpam: refactor device tree structure to support multiple devices ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I49RB2 CVE: NA --------------------------------------------------- The process of MPAM device tree initialization is like this: arm_mpam_device_probe() // driver probe mpam_discovery_start() // start discover mpam devices [...] // find and add mpam devices mpam_discovery_complete() // trigger mpam_enable When there are multiple mpam device nodes, the driver probe procedure will execute more than once. 
However, the mpam_discovery_start() and mpam_discovery_complete() should only run once. Besides, the start should run first, and the complete should run after all devices added. So we reorganize the device tree structure, so that there will be only one mpam device parent nodes, and the probe procedure will only run once. We add the child node to represent the mpam devices, and traverse and add all mpam devices in the middle procedure of driver probe. Signed-off-by: Xingang Wang Signed-off-by: Wang ShaoBo Reviewed-by: Cheng Jian Signed-off-by: Laibin Qiu --- arch/arm64/kernel/mpam/mpam_device.c | 59 +++++++++++++++------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c index 5d34751f8c3e..6455c69f132f 100644 --- a/arch/arm64/kernel/mpam/mpam_device.c +++ b/arch/arm64/kernel/mpam/mpam_device.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mpam_resource.h" #include "mpam_device.h" @@ -1718,10 +1719,9 @@ static const struct of_device_id arm_mpam_of_device_ids[] = { { } }; -static int of_mpam_parse_irq(struct platform_device *pdev, +static int of_mpam_parse_irq(struct device_node *node, struct mpam_device *dev) { - struct device_node *node = pdev->dev.of_node; u32 overflow_interrupt, overflow_flags; u32 error_interrupt, error_interrupt_flags; @@ -1736,12 +1736,12 @@ static int of_mpam_parse_irq(struct platform_device *pdev, error_interrupt, error_interrupt_flags); } -static int of_mpam_parse_cache(struct platform_device *pdev) +static int of_mpam_parse_cache(struct platform_device *pdev, + struct device_node *node) { struct mpam_device *dev; - struct device_node *node = pdev->dev.of_node; int cache_level, cache_id; - struct resource *res; + u64 reg_value[2]; if (of_property_read_u32(node, "cache-level", &cache_level)) { dev_err(&pdev->dev, "missing cache level property\n"); @@ -1754,27 +1754,27 @@ static int of_mpam_parse_cache(struct platform_device *pdev) } /* Base address */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { + if (of_property_read_u64_array(node, "reg", reg_value, 2)) { dev_err(&pdev->dev, "missing io resource property\n"); return -EINVAL; } - dev = mpam_device_create_cache(cache_level, cache_id, NULL, res->start); + dev = mpam_device_create_cache(cache_level, cache_id, NULL, + reg_value[0]); if (IS_ERR(dev)) { dev_err(&pdev->dev, "Failed to create cache node\n"); return -EINVAL; } - return of_mpam_parse_irq(pdev, dev); + return of_mpam_parse_irq(node, dev); } -static int of_mpam_parse_memory(struct platform_device *pdev) +static int of_mpam_parse_memory(struct platform_device *pdev, + struct device_node *node) { struct mpam_device *dev; - struct device_node *node = pdev->dev.of_node; int numa_id; - struct resource *res; + u64 reg_value[2]; if (of_property_read_u32(node, "numa-node-id", &numa_id)) { dev_err(&pdev->dev, "missing numa node id property\n"); @@ -1782,40 +1782,35 @@ static int of_mpam_parse_memory(struct platform_device *pdev) } /* Base address */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { + if (of_property_read_u64_array(node, "reg", reg_value, 2)) { dev_err(&pdev->dev, "missing io resource property\n"); return -EINVAL; } - dev = mpam_device_create_memory(numa_id, res->start); + dev = mpam_device_create_memory(numa_id, reg_value[0]); if (IS_ERR(dev)) { dev_err(&pdev->dev, "Failed to create memory node\n"); return -EINVAL; } - return of_mpam_parse_irq(pdev, dev); + return of_mpam_parse_irq(node, dev); } 
-static int of_mpam_parse(struct platform_device *pdev) +static int of_mpam_add_child(struct platform_device *pdev, + struct device_node *node) { - struct device *dev = &pdev->dev; - struct device_node *node = dev->of_node; enum mpam_class_types type; - if (!node || !of_match_node(arm_mpam_of_device_ids, pdev->dev.of_node)) - return -EINVAL; - - if (of_property_read_u32(dev->of_node, "type", &type)) { - dev_err(dev, "missing type property\n"); + if (of_property_read_u32(node, "type", &type)) { + dev_err(&pdev->dev, "missing type property\n"); return -EINVAL; } switch (type) { case MPAM_CLASS_CACHE: - return of_mpam_parse_cache(pdev); + return of_mpam_parse_cache(pdev, node); case MPAM_CLASS_MEMORY: - return of_mpam_parse_memory(pdev); + return of_mpam_parse_memory(pdev, node); default: pr_warn_once("Unknown node type %u.\n", type); return -EINVAL; @@ -1833,6 +1828,9 @@ static int of_mpam_parse(struct platform_device *pdev) static int arm_mpam_device_probe(struct platform_device *pdev) { int ret; + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct device_node *child = NULL; if (!cpus_have_const_cap(ARM64_HAS_MPAM)) return 0; @@ -1840,11 +1838,18 @@ static int arm_mpam_device_probe(struct platform_device *pdev) if (!acpi_disabled || mpam_enabled != MPAM_ENABLE_OF) return 0; + if (!node || !of_match_node(arm_mpam_of_device_ids, pdev->dev.of_node)) + return -EINVAL; + ret = mpam_discovery_start(); if (ret) return ret; - ret = of_mpam_parse(pdev); + for_each_available_child_of_node(node, child) { + ret = of_mpam_add_child(pdev, child); + if (ret) + break; + } if (ret) { mpam_discovery_failed(); -- Gitee From 880b19479afc21334003024addc19e8a8fedf735 Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Sat, 26 Mar 2022 09:38:40 +0800 Subject: [PATCH 100/340] dt-bindings: mpam: refactor device tree node structure arm64/mpam: refactor device tree structure to support multiple devices ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I49RB2 CVE: NA --------------------------------------------------- To support multiple mpam device nodes, all nodes should be organized as child of the same parent nodes. This makes sure that the mpam discovery start and complete procedure in the right execution order. Add modification in the devicetree documentation to record this. Signed-off-by: Xingang Wang Signed-off-by: Wang ShaoBo Reviewed-by: Cheng Jian Signed-off-by: Laibin Qiu --- .../devicetree/bindings/arm/arm,mpam.txt | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/arm,mpam.txt b/Documentation/devicetree/bindings/arm/arm,mpam.txt index 65c1e6809685..e9ba09bb3159 100644 --- a/Documentation/devicetree/bindings/arm/arm,mpam.txt +++ b/Documentation/devicetree/bindings/arm/arm,mpam.txt @@ -28,27 +28,30 @@ and Monitoring (MPAM), for Armv8-A", MPAM interrupts(section 8.8). 
Example: -mpam_memory0 { +mpam { compatible = "arm,mpam"; - reg = <0x0 0x10000000 0x0 0x10000>; - type = <2>; /* memory type */ - numa-node-id = <0>; - overflow-interrupt = <0>; - overflow-flags = <0>; - error-interrupt = <0>; - error-interrupt-flags = <0>; - not-ready-max = <0>; -}; -mpam_cache0 { - compatible = "arm,mpam"; - reg = <0x0 0x20000000 0x0 0x10000>; - type = <1>; /* cache type */ - cache-id = <0>; - cache-level = <3>; - overflow-interrupt = <0>; - overflow-flags = <0>; - error-interrupt = <0>; - error-interrupt-flags = <0>; - not-ready-max = <0>; + mpam_memory0 { + reg = <0x0 0x10000000 0x0 0x10000>; + type = <2>; /* memory type */ + numa-node-id = <0>; + overflow-interrupt = <0>; + overflow-flags = <0>; + error-interrupt = <0>; + error-interrupt-flags = <0>; + not-ready-max = <0>; + }; + + mpam_cache0 { + reg = <0x0 0x20000000 0x0 0x10000>; + type = <1>; /* cache type */ + cache-id = <0>; + cache-level = <3>; + overflow-interrupt = <0>; + overflow-flags = <0>; + error-interrupt = <0>; + error-interrupt-flags = <0>; + not-ready-max = <0>; + }; + }; -- Gitee From e79aa56b54e9bf6a2f314961c9872f71634f040c Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Sat, 26 Mar 2022 09:38:41 +0800 Subject: [PATCH 101/340] arm64/mpam: realign step entry when traversing rmid_transform hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SE03 CVE: NA --------------------------------------------------- This makes step entry aligned with step_size*step_cnt but not step_size, and check for alignment before traversing rmid_transform. When modifying rmid with a value not aligned with step_size*step_cnt, for_each_rmid_transform_point_step_from might miss next step point if it has been occupied in case step_cnt or step_size not equals to 1, which will cause the actual allocated rmid to be inconsistent with the expected one. Fixes: b47b9a817da1 ("arm64/mpam: rmid: refine allocation and release process") Signed-off-by: Wang ShaoBo Reviewed-by: Cheng Jian Signed-off-by: Laibin Qiu --- arch/arm64/kernel/mpam/mpam_resctrl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index 183c83d4274f..60d3d8706a38 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -839,7 +839,8 @@ static inline unsigned long **__rmid_remap_bmp(u32 col) #define __step_xy_initialize(step, x, y, from) \ (x = from, step = 1, y = 0) #define __step_align(from) \ - (!(from % rmid_remap_matrix.step_size)) + (!(from % (rmid_remap_matrix.step_size * \ + rmid_remap_matrix.step_cnt))) #define __step_overflow(step) \ (__xy_overflow(x, y) || \ (step > rmid_remap_matrix.step_cnt)) @@ -913,7 +914,7 @@ static int is_rmid_remap_bmp_full(unsigned long *bmp) bitmap_full(bmp, rmid_remap_matrix.rows)); } -static int rmid_remap_bmp_find_first_avail_partid(int partid) +static int rmid_remap_bmp_find_step_entry(int partid) { int x, y; unsigned long **bmp; @@ -922,17 +923,18 @@ static int rmid_remap_bmp_find_first_avail_partid(int partid) rmid_remap_matrix.cols) return 0; + /* step entry should be non-occupied and aligned */ bmp = __rmid_remap_bmp(partid); - if (bmp && !is_rmid_remap_bmp_occ(*bmp)) - return partid; + if (bmp) + return (is_rmid_remap_bmp_occ(*bmp) || + !__step_align(partid)) ? -ENOSPC : partid; for_each_rmid_transform_point_from(bmp, x, y, 0) { /* * do not waste partid resource, start - * from step_size aligned position. 
+ * from step aligned position. */ - if (!is_rmid_remap_bmp_occ(*bmp) && - (x % rmid_remap_matrix.step_size) == 0) + if (__step_align(x) && !is_rmid_remap_bmp_occ(*bmp)) return x; } @@ -1026,8 +1028,8 @@ static int __rmid_alloc(int partid, int pmg) if (pmg >= 0) checkpmg = true; - /* traverse from first non-occupied and step_size aligned entry */ - ret = rmid_remap_bmp_find_first_avail_partid(partid); + /* traverse from first non-occupied and step-aligned entry */ + ret = rmid_remap_bmp_find_step_entry(partid); if (ret < 0) goto out; partid = ret; -- Gitee From 2b491a2e84f8f1b5c8cbd4715ea52129ebd88ac4 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Sat, 26 Mar 2022 10:50:09 +0000 Subject: [PATCH 102/340] livepatch/arm64: Fix incorrect endian conversion when long jump hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4ZIB2 CVE: NA -------------------------------- Kernel panic happened on 'arm64 big endian' board after calling function that has been live-patched. It can be reproduced as follows: 1. Insert 'livepatch-sample.ko' to patch kernel function 'cmdline_proc_show'; 2. Enable patch by execute: echo 1 > /sys/kernel/livepatch/livepatch-sample/enabled 3. Call 'cmdline_proc_show' by execute: cat /proc/cmdline 4. Then we get following panic logs: > kernel BUG at arch/arm64/kernel/traps.c:408! > Internal error: Oops - BUG: 0 [#1] SMP > Modules linked in: dump_mem(OE) livepatch_cmdline1(OEK) hi1382_gmac(OE) > [last unloaded: dump_mem] > CPU: 3 PID: 1752 Comm: cat Session: 0 Tainted: G OE K > 5.10.0+ #2 > Hardware name: Hisilicon PhosphorHi1382 (DT) > pstate: 00000005 (nzcv daif -PAN -UAO -TCO BTYPE=--) > pc : do_undefinstr+0x23c/0x2b4 > lr : do_undefinstr+0x5c/0x2b4 > sp : ffffffc010ac3a80 > x29: ffffffc010ac3a80 x28: ffffff82eb0a8000 > x27: 0000000000000000 x26: 0000000000000001 > x25: 0000000000000000 x24: 0000000000001000 > x23: 0000000000000000 x22: ffffffd0e0f16000 > x21: ffffffd0e0ae7000 x20: ffffffc010ac3b00 > x19: 0000000000021fd6 x18: ffffffd0e04aad94 > x17: 0000000000000000 x16: 0000000000000000 > x15: ffffffd0e04b519c x14: 0000000000000000 > x13: 0000000000000000 x12: 0000000000000000 > x11: 0000000000000000 x10: 0000000000000000 > x9 : 0000000000000000 x8 : 0000000000000000 > x7 : 0000000000000000 x6 : ffffffd0e0f16100 > x5 : 0000000000000000 x4 : 00000000d5300000 > x3 : 0000000000000000 x2 : ffffffd0e0f160f0 > x1 : ffffffd0e0f16103 x0 : 0000000000000005 > Call trace: > do_undefinstr+0x23c/0x2b4 > el1_undef+0x2c/0x44 > el1_sync_handler+0xa4/0xb0 > el1_sync+0x74/0x100 > cmdline_proc_show+0xc/0x44 > proc_reg_read_iter+0xb0/0xc4 > new_sync_read+0x10c/0x15c > vfs_read+0x144/0x18c > ksys_read+0x78/0xe8 > __arm64_sys_read+0x24/0x30 We compare first 6 instructions of 'cmdline_proc_show' before and after patch (see below). There are 4 instructions modified, so this is case that offset between old and new function is out of 128M. And we found that instruction at 'cmdline_proc_show+0xc' seems incorrect (it expects to be '00021fd6'). origin: patched: -------- -------- fd7bbea9 929ff7f0 21d500f0 f2a91b30 fd030091 f2d00010 211040f9 d61f0200 <-- cmdline_proc_show+0xc (expect is '00021fd6') f30b00f9 f30b00f9 f30300aa f30300aa It is caused by an incorrect big-to-little endian conversion, and we correct it. 
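The sketch below spells out the double conversion; it is illustrative and assumes the usual behaviour of aarch64_insn_patch_text_nosync(), which already converts the CPU-endian instruction value to little-endian when writing it to memory:

	u32 insn = 0xd61f0200;	/* br x16, as a CPU-endian value */

	/* correct: the helper performs the memory-order conversion itself */
	aarch64_insn_patch_text_nosync((void *)pc, insn);

	/* broken on big-endian kernels: cpu_to_le32() swaps the bytes here
	 * and the helper swaps them again on write, so the text section
	 * ends up holding a different, undefined instruction encoding */
	aarch64_insn_patch_text_nosync((void *)pc, cpu_to_le32(insn));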
Fixes: 5aa9a1a3488b ("livepatch/arm64: support livepatch without ftrace") Signed-off-by: Zheng Yejian Reviewed-by: Kuohai Xu Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/livepatch.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/livepatch.c b/arch/arm64/kernel/livepatch.c index d966d4767cb7..235e6f8b6719 100644 --- a/arch/arm64/kernel/livepatch.c +++ b/arch/arm64/kernel/livepatch.c @@ -251,10 +251,10 @@ int arch_klp_patch_func(struct klp_func *func) if (aarch64_insn_patch_text_nosync((void *)pc, insn)) goto ERR_OUT; } else { - insns[0] = cpu_to_le32(0x92800010 | (((~new_addr) & 0xffff)) << 5); - insns[1] = cpu_to_le32(0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5); - insns[2] = cpu_to_le32(0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5); - insns[3] = cpu_to_le32(0xd61f0200); + insns[0] = 0x92800010 | (((~new_addr) & 0xffff)) << 5; + insns[1] = 0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5; + insns[2] = 0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5; + insns[3] = 0xd61f0200; for (i = 0; i < LJMP_INSN_SIZE; i++) { if (aarch64_insn_patch_text_nosync(((u32 *)pc) + i, insns[i])) goto ERR_OUT; @@ -328,10 +328,10 @@ void arch_klp_unpatch_func(struct klp_func *func) aarch64_insn_patch_text_nosync((void *)pc, insn); } else { - insns[0] = cpu_to_le32(0x92800010 | (((~new_addr) & 0xffff)) << 5); - insns[1] = cpu_to_le32(0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5); - insns[2] = cpu_to_le32(0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5); - insns[3] = cpu_to_le32(0xd61f0200); + insns[0] = 0x92800010 | (((~new_addr) & 0xffff)) << 5; + insns[1] = 0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5; + insns[2] = 0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5; + insns[3] = 0xd61f0200; for (i = 0; i < LJMP_INSN_SIZE; i++) aarch64_insn_patch_text_nosync(((u32 *)pc) + i, insns[i]); -- Gitee From 875ffd41499ee5a3512da409cbd4c2ffd32b3cfa Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 30 Mar 2022 03:33:02 +0000 Subject: [PATCH 103/340] mm: Do limit checking after memory allocation for memory reliable hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Previous limit checking are done before real memory allocation. This will lead to some steps in __alloc_pages_slowpath() unreached(kswapd, direct compact and so on). Now limit checking are done in the end of __alloc_pages_nodemask(). Pages will be released if this memory allocation is stopped by limit checking. Memory allocation will fallback to movable zone if one of the following conditions are met: - memory reliable fallback is enabled - global init process Memory allocation with __GFP_NOFAIL will not check any limit. If memory reliable fallback is disabled and gfp flag does not contains any of the following flags: - __GFP_NORETRY - __GFP_RETRY_MAYFAIL - __GFP_THISNODE Or the following conditions are false - current->flags & PF_DUMPCORE - order > PAGE_ALLOC_COSTLY_ORDER Oom will occur to release some memory. 
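In rough pseudo-code (a simplification, not a literal copy of the hunks below), the allocation entry point now behaves like this:

	/* __alloc_pages_nodemask(), heavily simplified */
	prepare_before_alloc(&gfp_mask);	/* may add ___GFP_RELIABILITY */
retry:
	/* ... usual fast path and, if needed, __alloc_pages_slowpath() ... */
	if (check_after_alloc(&gfp_mask, order, preferred_nid, nodemask, &page))
		goto retry;	/* limit hit: any reliable page was freed,
				 * ___GFP_RELIABILITY is cleared and the
				 * retry may fall back to the movable zone */
	return page;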
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/page_alloc.c | 194 +++++++++++++++++++----------------------------- 1 file changed, 76 insertions(+), 118 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4d4716b9049..fa1cdeba5293 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3671,60 +3671,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, return page; } -#ifdef CONFIG_MEMORY_RELIABLE -static inline void reliable_fb_find_zone(gfp_t gfp_mask, - struct alloc_context *ac) -{ - if (!reliable_allow_fb_enabled()) - return; - - /* dst node don't have zone we want, fallback here */ - if ((gfp_mask & __GFP_THISNODE) && (ac->high_zoneidx == ZONE_NORMAL) && - (gfp_mask & ___GFP_RELIABILITY)) { - ac->high_zoneidx = gfp_zone(gfp_mask & ~___GFP_RELIABILITY); - ac->preferred_zoneref = first_zones_zonelist( - ac->zonelist, ac->high_zoneidx, ac->nodemask); - } - - return; -} - -static inline struct page * -reliable_fb_before_oom(gfp_t gfp_mask, int order, - const struct alloc_context *ac) -{ - if (!reliable_allow_fb_enabled()) - return NULL; - - /* key user process alloc mem from movable zone to avoid oom */ - if ((ac->high_zoneidx == ZONE_NORMAL) && - (gfp_mask & ___GFP_RELIABILITY)) { - struct alloc_context tmp_ac = *ac; - - tmp_ac.high_zoneidx = ZONE_MOVABLE; - tmp_ac.preferred_zoneref = first_zones_zonelist( - ac->zonelist, ZONE_MOVABLE, ac->nodemask); - return get_page_from_freelist( - (gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM, - order, ALLOC_WMARK_HIGH | ALLOC_CPUSET, &tmp_ac); - } - - return NULL; -} -#else -static inline void reliable_fb_find_zone(gfp_t gfp_mask, - struct alloc_context *ac) -{ - return; -} - -static inline struct page *reliable_fb_before_oom(gfp_t gfp_mask, int order, - const struct alloc_context *ac) -{ - return NULL; -} -#endif - static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -3763,10 +3709,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - page = reliable_fb_before_oom(gfp_mask, order, ac); - if (page) - goto out; - /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) goto out; @@ -4374,12 +4316,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); - if (!ac->preferred_zoneref->zone) { - reliable_fb_find_zone(gfp_mask, ac); - - if (!ac->preferred_zoneref->zone) - goto nopage; - } + if (!ac->preferred_zoneref->zone) + goto nopage; if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, gfp_mask, ac); @@ -4638,74 +4576,94 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) ac->high_zoneidx, ac->nodemask); } -/* - * return false means this allocation is limit by reliable user limit and - * this will lead to pagefault_out_of_memory() - */ -static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order) +static inline void prepare_before_alloc(gfp_t *gfp_mask) { gfp_t gfp_ori = *gfp_mask; + bool zone_movable; + *gfp_mask &= gfp_allowed_mask; if (!mem_reliable_is_enabled()) - return true; + return; - if (*gfp_mask & __GFP_NOFAIL) - return true; + /* + * memory reliable only handle memory allocation from movable zone + * (force alloc from non-movable zone or force alloc from movable + * zone) to get total isolation. 
+ */ + zone_movable = gfp_zone(*gfp_mask) == ZONE_MOVABLE; + if (!zone_movable) + return; if (gfp_ori & ___GFP_RELIABILITY) { - if (!(gfp_ori & __GFP_HIGHMEM) || !(gfp_ori & __GFP_MOVABLE)) - return true; + *gfp_mask |= ___GFP_RELIABILITY; + return; + } - if (mem_reliable_watermark_ok(1 << order)) { - *gfp_mask |= ___GFP_RELIABILITY; - return true; - } + if (is_global_init(current) || (current->flags & PF_RELIABLE)) + *gfp_mask |= ___GFP_RELIABILITY; +} - if (reliable_allow_fb_enabled()) - return true; +/* + * return true means memory allocation need retry and flag ___GFP_RELIABILITY + * must be cleared. + */ +static inline bool check_after_alloc(gfp_t *gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask, + struct page **_page) +{ + if (!mem_reliable_is_enabled()) + return false; + if (!(*gfp_mask & ___GFP_RELIABILITY)) return false; - } - /* - * Init tasks will alloc memory from non-mirrored region if their - * allocation trigger task_reliable_limit - */ - if (is_global_init(current)) { - if (!mem_reliable_counter_initialized()) { - *gfp_mask |= ___GFP_RELIABILITY; - return true; - } + if (!*_page) + goto out_retry; - if (reliable_mem_limit_check(1 << order) && - mem_reliable_watermark_ok(1 << order)) - *gfp_mask |= ___GFP_RELIABILITY; - return true; - } + if (*gfp_mask & __GFP_NOFAIL) + goto out; - /* - * This only check task_reliable_limit without ___GFP_RELIABILITY - * or this process is global init. - * For kernel internal mechanism(hugepaged collapse and others) - * If they alloc memory for user and obey task_reliable_limit, they - * need to check this limit before allocing pages. - */ - if ((current->flags & PF_RELIABLE) && (gfp_ori & __GFP_HIGHMEM) && - (gfp_ori & __GFP_MOVABLE)) { - if (reliable_mem_limit_check(1 << order) && - mem_reliable_watermark_ok(1 << order)) { - *gfp_mask |= ___GFP_RELIABILITY; - return true; - } + /* check water mark, reserver mirrored mem for kernel */ + if (!mem_reliable_watermark_ok(1 << order)) + goto out_free_page; - if (reliable_allow_fb_enabled()) - return true; + /* percpu counter is not initialized, ignore limit check */ + if (!mem_reliable_counter_initialized()) + goto out; - return false; + /* spcial user task, systemd is limited by task_reliable_limit */ + if (((current->flags & PF_RELIABLE) || is_global_init(current)) && + !reliable_mem_limit_check(1 << order)) + goto out_free_page; + + goto out; + +out_free_page: + __free_pages(*_page, order); + *_page = NULL; + +out_retry: + if (reliable_allow_fb_enabled() || is_global_init(current)) { + *gfp_mask &= ~___GFP_RELIABILITY; + return true; } - return true; + if (*gfp_mask & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE)) + goto out; + + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + + /* oom here */ + mem_reliable_out_of_memory(*gfp_mask, order, preferred_nid, + nodemask); +out: + return false; } /* @@ -4729,12 +4687,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, return NULL; } - if (!prepare_before_alloc(&gfp_mask, order)) { - mem_reliable_out_of_memory(gfp_mask, order, preferred_nid, - nodemask); - goto out; - } + prepare_before_alloc(&gfp_mask); +retry: alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; @@ -4771,6 +4726,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int 
order, int preferred_nid, page = NULL; } + if (check_after_alloc(&gfp_mask, order, preferred_nid, nodemask, &page)) + goto retry; + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; -- Gitee From a30ba5b645cf928245fa7db7705afda57b3b3d72 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 30 Mar 2022 03:33:03 +0000 Subject: [PATCH 104/340] mm: Disable watermark check if reliable fallback is disabled hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- If memory reliable watermark is far greater than zone's default watermark, reliable memory allocation will trigger oom without wake up kswapd to release pagecache. With this patch, memory reliable watermark will be disabled if memory fallback is disabled. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- include/linux/mem_reliable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index c4f954340ea7..8340a24fe76d 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -124,6 +124,9 @@ static inline bool mem_reliable_watermark_ok(int nr_page) long sum = 0; int cpu; + if (!reliable_allow_fb_enabled()) + return true; + for_each_possible_cpu(cpu) sum += per_cpu(nr_reliable_buddy_pages, cpu); -- Gitee From 368d710d7f327b6512cbc90846b33cbf9934a579 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 30 Mar 2022 03:33:04 +0000 Subject: [PATCH 105/340] mm: Fallback to non-mirrored region below low watermark hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Commit 45bd608ef89e ("mm: Introduce watermark check for memory reliable") introduce watermark to reserve memory in mirrored region for kernel usage. But this value does not interact well with kswapd and this will lead to fallback to non-mirrored region without release pagecache in mirrored region. With this patch, watermark is set to zero to avoid this problem. Memory allocation with ___GFP_RELIABILITY will fallback to non-mirrored region if zone's low watermark is reached and kswapd will be awakened at this time. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/mem_reliable.c | 2 +- mm/page_alloc.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 5505577d3784..6d4ab4bee3d5 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -11,7 +11,7 @@ #include #include -#define MEM_RELIABLE_RESERVE_MIN (256UL << 20) +#define MEM_RELIABLE_RESERVE_MIN 0 enum mem_reliable_types { MEM_RELIABLE_ALL, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fa1cdeba5293..9d4f75235420 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4271,6 +4271,29 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) return false; } +#ifdef CONFIG_MEMORY_RELIABLE +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) +{ + if (!reliable_allow_fb_enabled()) + return; + + if (gfp_mask & __GFP_NOFAIL) + return; + + if ((ac->high_zoneidx == ZONE_NORMAL) && + (gfp_mask & ___GFP_RELIABILITY)) { + ac->high_zoneidx = gfp_zone(gfp_mask & ~___GFP_RELIABILITY); + ac->preferred_zoneref = first_zones_zonelist( + ac->zonelist, ac->high_zoneidx, ac->nodemask); + return; + } +} +#else +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) {} +#endif + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -4322,6 +4345,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, gfp_mask, ac); + mem_reliable_fallback_slowpath(gfp_mask, ac); + /* * The adjusted alloc_flags might result in immediate success, so try * that first -- Gitee From 93dfedd141b0d49150ce32ad88ca93fc1f614f47 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Sat, 2 Apr 2022 07:03:41 +0000 Subject: [PATCH 106/340] ima: Don't ignore errors from crypto_shash_update() stable inclusion from stable-v4.19.153 commit c470dc530c9ee6ef4b22fed19c77e20c745564e1 category: bugfix bugzilla: 83782, https://gitee.com/openeuler/kernel/issues/I5047U CVE: NA ----------------------------------------------------------------- commit 60386b854008adc951c470067f90a2d85b5d520f upstream. Errors returned by crypto_shash_update() are not checked in ima_calc_boot_aggregate_tfm() and thus can be overwritten at the next iteration of the loop. This patch adds a check after calling crypto_shash_update() and returns immediately if the result is not zero. 
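Condensed, the loop in ima_calc_boot_aggregate_tfm() ends up with the shape below; this is illustrative only, and the TPM_PCR0/TPM_PCR8 loop bounds are quoted from memory of the 4.19 IMA code rather than from this patch:

	for (i = TPM_PCR0; i < TPM_PCR8; i++) {
		ima_pcrread(i, pcr_i);
		rc = crypto_shash_update(shash, pcr_i, TPM_DIGEST_SIZE);
		if (rc != 0)		/* without this check, a later	   */
			return rc;	/* iteration overwrites the error  */
	}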
Cc: stable@vger.kernel.org Fixes: 3323eec921efd ("integrity: IMA as an integrity service provider") Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Weiyang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- security/integrity/ima/ima_crypto.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index d9f4b4c84519..1289757e73ea 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -683,6 +683,8 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest, ima_pcrread(i, pcr_i); /* now accumulate with current aggregate */ rc = crypto_shash_update(shash, pcr_i, TPM_DIGEST_SIZE); + if (rc != 0) + return rc; } if (!rc) crypto_shash_final(shash, digest); -- Gitee From 192b51c29805e6e4921a698a828225d518cc27c0 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Sat, 2 Apr 2022 07:03:42 +0000 Subject: [PATCH 107/340] evm: Check size of security.evm before using it stable inclusion from stable-v4.19.155 commit 05f703b07727c0eb81f487143b583d5f8561d900 category: bugfix bugzilla: 83797, https://gitee.com/openeuler/kernel/issues/I5047U CVE: NA ----------------------------------------------------------------- commit 455b6c9112eff8d249e32ba165742085678a80a4 upstream. This patch checks the size for the EVM_IMA_XATTR_DIGSIG and EVM_XATTR_PORTABLE_DIGSIG types to ensure that the algorithm is read from the buffer returned by vfs_getxattr_alloc(). Cc: stable@vger.kernel.org # 4.19.x Fixes: 5feeb61183dde ("evm: Allow non-SHA1 digital signatures") Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Weiyang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- security/integrity/evm/evm_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index c0b08ac32dcf..47f878b13990 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -190,6 +190,12 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, break; case EVM_IMA_XATTR_DIGSIG: case EVM_XATTR_PORTABLE_DIGSIG: + /* accept xattr with non-empty signature field */ + if (xattr_len <= sizeof(struct signature_v2_hdr)) { + evm_status = INTEGRITY_FAIL; + goto out; + } + hdr = (struct signature_v2_hdr *)xattr_data; digest.hdr.algo = hdr->hash_algo; rc = evm_calc_hash(dentry, xattr_name, xattr_value, -- Gitee From b7a7cb07f8f7a9b25a73a4e5c03f46cd5a29de6d Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Sat, 2 Apr 2022 07:03:43 +0000 Subject: [PATCH 108/340] ima: Call ima_calc_boot_aggregate() in ima_eventdigest_init() stable inclusion from stable-v4.19.129 commit fcb067cb457e2326c6d759e346f5f5dfef351d50 category: bugfix bugzilla: 89622, https://gitee.com/openeuler/kernel/issues/I5047U CVE: NA ----------------------------------------------------------------- [ Upstream commit 6cc7c266e5b47d3cd2b5bb7fd3aac4e6bb2dd1d2 ] If the template field 'd' is chosen and the digest to be added to the measurement entry was not calculated with SHA1 or MD5, it is recalculated with SHA1, by using the passed file descriptor. However, this cannot be done for boot_aggregate, because there is no file descriptor. This patch adds a call to ima_calc_boot_aggregate() in ima_eventdigest_init(), so that the digest can be recalculated also for the boot_aggregate entry. 
Cc: stable@vger.kernel.org # 3.13.x Fixes: 3ce1217d6cd5d ("ima: define template fields library and new helpers") Reported-by: Takashi Iwai Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar Signed-off-by: Sasha Levin Conflicts: security/integrity/ima/ima_crypto.c Signed-off-by: Wang Weiyang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- security/integrity/ima/ima.h | 3 ++- security/integrity/ima/ima_crypto.c | 6 +++--- security/integrity/ima/ima_init.c | 2 +- security/integrity/ima/ima_template_lib.c | 18 ++++++++++++++++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 8d58874a8caa..2619dee3b4a5 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -56,6 +56,7 @@ extern int ima_policy_flag; extern int ima_hash_algo; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; +extern const char boot_aggregate_name[]; /* IMA event related data */ struct ima_event_data { @@ -139,7 +140,7 @@ int ima_calc_buffer_hash(const void *buf, loff_t len, int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_desc *desc, int num_fields, struct ima_digest_data *hash); -int __init ima_calc_boot_aggregate(struct ima_digest_data *hash); +int ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct integrity_iint_cache *iint, const char *op, const char *cause); diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 1289757e73ea..0b2cd7cc9b14 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -663,8 +663,8 @@ static void __init ima_pcrread(int idx, u8 *pcr) /* * Calculate the boot aggregate hash */ -static int __init ima_calc_boot_aggregate_tfm(char *digest, - struct crypto_shash *tfm) +static int ima_calc_boot_aggregate_tfm(char *digest, + struct crypto_shash *tfm) { /*to be compatible with tpm1.2 ,make sure the content is not arbitrary*/ u8 pcr_i[IMA_DIGEST_SIZE] = {0}; @@ -691,7 +691,7 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest, return rc; } -int __init ima_calc_boot_aggregate(struct ima_digest_data *hash) +int ima_calc_boot_aggregate(struct ima_digest_data *hash) { struct crypto_shash *tfm; int rc; diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index faac9ecaa0ae..a2bc4cb4482a 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -25,7 +25,7 @@ #include "ima.h" /* name for boot aggregate entry */ -static const char *boot_aggregate_name = "boot_aggregate"; +const char boot_aggregate_name[] = "boot_aggregate"; struct tpm_chip *ima_tpm_chip; /* Add the boot aggregate to the IMA measurement list and extend diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 43752002c222..48c5a1be88ac 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -284,6 +284,24 @@ int ima_eventdigest_init(struct ima_event_data *event_data, goto out; } + if ((const char *)event_data->filename == boot_aggregate_name) { + if (ima_tpm_chip) { + hash.hdr.algo = HASH_ALGO_SHA1; + result = ima_calc_boot_aggregate(&hash.hdr); + + /* algo can change depending on available PCR banks */ + if (!result && hash.hdr.algo != HASH_ALGO_SHA1) + result = -EINVAL; + + if (result < 0) + memset(&hash, 0, sizeof(hash)); + } + + cur_digest = 
hash.hdr.digest; + cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; + goto out; + } + if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; -- Gitee From d073934a1ceae819057c8e1a98d384cb506b1806 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Sat, 2 Apr 2022 07:03:44 +0000 Subject: [PATCH 109/340] ima: Remove __init annotation from ima_pcrread() stable inclusion from stable-v4.19.169 commit de581e41716795ce93506f3e5b0200048aa4439c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5047U CVE: NA ----------------------------------------------------------------- commit 8b8c704d913b0fe490af370631a4200e26334ec0 upstream. Commit 6cc7c266e5b4 ("ima: Call ima_calc_boot_aggregate() in ima_eventdigest_init()") added a call to ima_calc_boot_aggregate() so that the digest can be recalculated for the boot_aggregate measurement entry if the 'd' template field has been requested. For the 'd' field, only SHA1 and MD5 digests are accepted. Given that ima_eventdigest_init() does not have the __init annotation, all functions called should not have it. This patch removes __init from ima_pcrread(). Cc: stable@vger.kernel.org Fixes: 6cc7c266e5b4 ("ima: Call ima_calc_boot_aggregate() in ima_eventdigest_init()") Reported-by: Linus Torvalds Signed-off-by: Roberto Sassu Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Weiyang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- security/integrity/ima/ima_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 0b2cd7cc9b14..9f969dc8654d 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -651,7 +651,7 @@ int ima_calc_buffer_hash(const void *buf, loff_t len, return calc_buffer_shash(buf, len, hash); } -static void __init ima_pcrread(int idx, u8 *pcr) +static void ima_pcrread(int idx, u8 *pcr) { if (!ima_tpm_chip) return; -- Gitee From 458e2478b8a6afeb6ea9cf6587ae2db89412f5f0 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Sat, 2 Apr 2022 07:03:45 +0000 Subject: [PATCH 110/340] ima: Set file->f_mode instead of file->f_flags in ima_calc_file_hash() stable inclusion from stable-v4.19.125 commit 904de138bae903a6e377322e64312e22a84f2140 category: bugfix bugzilla: 91657, https://gitee.com/openeuler/kernel/issues/I5047U CVE: NA ----------------------------------------------------------------- [ Upstream commit 0014cc04e8ec077dc482f00c87dfd949cfe2b98f ] Commit a408e4a86b36 ("ima: open a new file instance if no read permissions") tries to create a new file descriptor to calculate a file digest if the file has not been opened with O_RDONLY flag. However, if a new file descriptor cannot be obtained, it sets the FMODE_READ flag to file->f_flags instead of file->f_mode. This patch fixes this issue by replacing f_flags with f_mode as it was before that commit. 
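The mixup matters because f_flags and f_mode use different bit namespaces, so setting FMODE_READ in f_flags neither grants read capability nor touches the field the VFS actually checks. A short sketch, assuming the conventional Linux values of these constants:

	/*
	 * Sketch only -- relies on the usual definitions:
	 *   f_flags carries O_* open flags  (O_RDONLY 0, O_WRONLY 1, O_RDWR 2, ...)
	 *   f_mode  carries FMODE_* bits    (FMODE_READ 0x1, FMODE_WRITE 0x2, ...)
	 *
	 * "f->f_flags |= FMODE_READ" only sets bit 0 of f_flags, while the VFS
	 * read checks look at FMODE_READ in f_mode, which stays clear.  Hence:
	 */
	f->f_mode |= FMODE_READ;	/* what the fix restores */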
Cc: stable@vger.kernel.org # 4.20.x Fixes: a408e4a86b36 ("ima: open a new file instance if no read permissions") Signed-off-by: Roberto Sassu Reviewed-by: Goldwyn Rodrigues Signed-off-by: Mimi Zohar Signed-off-by: Sasha Levin Signed-off-by: Wang Weiyang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- security/integrity/ima/ima_crypto.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 9f969dc8654d..072a95ddc8b5 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -415,7 +415,7 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) loff_t i_size; int rc; struct file *f = file; - bool new_file_instance = false, modified_flags = false; + bool new_file_instance = false, modified_mode = false; /* * For consistency, fail file's opened with the O_DIRECT flag on @@ -435,13 +435,13 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) f = dentry_open(&file->f_path, flags, file->f_cred); if (IS_ERR(f)) { /* - * Cannot open the file again, lets modify f_flags + * Cannot open the file again, lets modify f_mode * of original and continue */ pr_info_ratelimited("Unable to reopen file for reading.\n"); f = file; - f->f_flags |= FMODE_READ; - modified_flags = true; + f->f_mode |= FMODE_READ; + modified_mode = true; } else { new_file_instance = true; } @@ -459,8 +459,8 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) out: if (new_file_instance) fput(f); - else if (modified_flags) - f->f_flags &= ~FMODE_READ; + else if (modified_mode) + f->f_mode &= ~FMODE_READ; return rc; } -- Gitee From d342874100782784237d51154f193a934e88c729 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Sat, 2 Apr 2022 07:03:46 +0000 Subject: [PATCH 111/340] ima: Don't modify file descriptor mode on the fly stable inclusion from stable-v4.19.164 commit 709ed96f6ef2b040a0ea9274b4d67ce61a095eb3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5047U CVE: NA ----------------------------------------------------------------- commit 207cdd565dfc95a0a5185263a567817b7ebf5467 upstream. Commit a408e4a86b36b ("ima: open a new file instance if no read permissions") already introduced a second open to measure a file when the original file descriptor does not allow it. However, it didn't remove the existing method of changing the mode of the original file descriptor, which is still necessary if the current process does not have enough privileges to open a new one. Changing the mode isn't really an option, as the filesystem might need to do preliminary steps to make the read possible. Thus, this patch removes the code and keeps the second open as the only option to measure a file when it is unreadable with the original file descriptor. 
Cc: # 4.20.x: 0014cc04e8ec0 ima: Set file->f_mode Fixes: 2fe5d6def1672 ("ima: integrity appraisal extension") Signed-off-by: Roberto Sassu Reviewed-by: Christoph Hellwig Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Weiyang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- security/integrity/ima/ima_crypto.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 072a95ddc8b5..cda1697c8fda 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -415,7 +415,7 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) loff_t i_size; int rc; struct file *f = file; - bool new_file_instance = false, modified_mode = false; + bool new_file_instance = false; /* * For consistency, fail file's opened with the O_DIRECT flag on @@ -433,18 +433,10 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) O_TRUNC | O_CREAT | O_NOCTTY | O_EXCL); flags |= O_RDONLY; f = dentry_open(&file->f_path, flags, file->f_cred); - if (IS_ERR(f)) { - /* - * Cannot open the file again, lets modify f_mode - * of original and continue - */ - pr_info_ratelimited("Unable to reopen file for reading.\n"); - f = file; - f->f_mode |= FMODE_READ; - modified_mode = true; - } else { - new_file_instance = true; - } + if (IS_ERR(f)) + return PTR_ERR(f); + + new_file_instance = true; } i_size = i_size_read(file_inode(f)); @@ -459,8 +451,6 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) out: if (new_file_instance) fput(f); - else if (modified_mode) - f->f_mode &= ~FMODE_READ; return rc; } -- Gitee From 50cea8d3bee9c0b82409c216fb945a4c0e05e5a9 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Sat, 2 Apr 2022 07:03:47 +0000 Subject: [PATCH 112/340] ima: Fix return value of ima_write_policy() stable inclusion from stable-v4.19.125 commit 657a03ff6c97319d8e664c05a1beebd8eb15d049 category: bugfix bugzilla: 91661, https://gitee.com/openeuler/kernel/issues/I5047U CVE: NA ----------------------------------------------------------------- [ Upstream commit 2e3a34e9f409ebe83d1af7cd2f49fca7af97dfac ] This patch fixes the return value of ima_write_policy() when a new policy is directly passed to IMA and the current policy requires appraisal of the file containing the policy. Currently, if appraisal is not in ENFORCE mode, ima_write_policy() returns 0 and leads user space applications to an endless loop. Fix this issue by denying the operation regardless of the appraisal mode. 
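The "endless loop" arises from ordinary short-write handling in user space: a caller that advances by write()'s return value makes no progress when the kernel reports success but consumes 0 bytes. A hypothetical sketch of such a caller (the function name and structure are illustrative, not taken from any particular tool):

#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_policy(int fd, const char *rule)
{
	size_t done = 0, len = strlen(rule);

	while (done < len) {
		ssize_t n = write(fd, rule + done, len - done);

		if (n < 0) {
			perror("write");	/* e.g. EACCES after this fix */
			return -1;
		}
		done += n;			/* n == 0 -> loop never ends */
	}
	return 0;
}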
Cc: stable@vger.kernel.org # 4.10.x Fixes: 19f8a84713edc ("ima: measure and appraise the IMA policy itself") Signed-off-by: Roberto Sassu Reviewed-by: Krzysztof Struczynski Signed-off-by: Mimi Zohar Signed-off-by: Sasha Levin Signed-off-by: Wang Weiyang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- security/integrity/ima/ima_fs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index cfb8cc3b975e..604cdac63d84 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -343,8 +343,7 @@ static ssize_t ima_write_policy(struct file *file, const char __user *buf, integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, "policy_update", "signed policy required", 1, 0); - if (ima_appraise & IMA_APPRAISE_ENFORCE) - result = -EACCES; + result = -EACCES; } else { result = ima_parse_add_rule(data); } -- Gitee From 2126576ff274665fa5b98bff56e7da1e0a6335cc Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Sat, 2 Apr 2022 07:03:48 +0000 Subject: [PATCH 113/340] sr9700: sanity check for packet length mainline inclusion from mainline-v5.17-rc4 commit e9da0b56fe27206b49f39805f7dcda8a89379062 category: bugfix bugzilla: 186472, https://gitee.com/openeuler/kernel/issues/I4ZWKH CVE: CVE-2022-26966 -------------------------------- A malicious device can leak heap data to user space providing bogus frame lengths. Introduce a sanity check. Signed-off-by: Oliver Neukum Reviewed-by: Grant Grundler Signed-off-by: David S. Miller Signed-off-by: Lijun Fang Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Yongqiang Liu --- drivers/net/usb/sr9700.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index 6ac232e52bf7..83640628c47d 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -410,7 +410,7 @@ static int sr9700_rx_fixup(struct usbnet *dev, struct sk_buff *skb) /* ignore the CRC length */ len = (skb->data[1] | (skb->data[2] << 8)) - 4; - if (len > ETH_FRAME_LEN) + if (len > ETH_FRAME_LEN || len > skb->len) return 0; /* the last packet of current skb */ -- Gitee From a91080fd1fe919a42e817866a08f4a30bbf04238 Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Sat, 2 Apr 2022 07:03:49 +0000 Subject: [PATCH 114/340] USB: gadget: validate endpoint index for xilinx udc mainline inclusion from mainline-v5.17-rc4 commit 7f14c7227f342d9932f9b918893c8814f86d2a0d category: bugfix bugzilla: 186427, https://gitee.com/openeuler/kernel/issues/I4ZWQA CVE: CVE-2022-27223 -------------------------------- Assure that host may not manipulate the index to point past endpoint array. 
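Note that the existing masking is not itself a bound: USB_ENDPOINT_NUMBER_MASK only keeps the endpoint-number bits of wIndex (0..15), while the driver's ep[] array can be smaller, so an explicit compare against the array size is still needed. A sketch of the added check (XUSB_MAX_ENDPOINTS is the driver's own array bound; its value is not shown here):

	epnum = udc->setup.wIndex & USB_ENDPOINT_NUMBER_MASK;	/* host-controlled, 0..15 */
	if (epnum >= XUSB_MAX_ENDPOINTS)	/* size of udc->ep[] */
		goto stall;			/* reject rather than index out of bounds */
	target_ep = &udc->ep[epnum];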
Signed-off-by: Szymon Heidrich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Lijun Fang Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Yongqiang Liu --- drivers/usb/gadget/udc/udc-xilinx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/gadget/udc/udc-xilinx.c b/drivers/usb/gadget/udc/udc-xilinx.c index 6407e433bc78..72f1bc6a680e 100644 --- a/drivers/usb/gadget/udc/udc-xilinx.c +++ b/drivers/usb/gadget/udc/udc-xilinx.c @@ -1613,6 +1613,8 @@ static void xudc_getstatus(struct xusb_udc *udc) break; case USB_RECIP_ENDPOINT: epnum = udc->setup.wIndex & USB_ENDPOINT_NUMBER_MASK; + if (epnum >= XUSB_MAX_ENDPOINTS) + goto stall; target_ep = &udc->ep[epnum]; epcfgreg = udc->read_fn(udc->addr + target_ep->offset); halt = epcfgreg & XUSB_EP_CFG_STALL_MASK; @@ -1680,6 +1682,10 @@ static void xudc_set_clear_feature(struct xusb_udc *udc) case USB_RECIP_ENDPOINT: if (!udc->setup.wValue) { endpoint = udc->setup.wIndex & USB_ENDPOINT_NUMBER_MASK; + if (endpoint >= XUSB_MAX_ENDPOINTS) { + xudc_ep0_stall(udc); + return; + } target_ep = &udc->ep[endpoint]; outinbit = udc->setup.wIndex & USB_ENDPOINT_DIR_MASK; outinbit = outinbit >> 7; -- Gitee From a5e62d73dee762e486e26df79fe05df5c2e843c6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 2 Apr 2022 07:03:50 +0000 Subject: [PATCH 115/340] Revert "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.18-rc1 commit bddac7c1e02ba47f0570e494c9289acea3062cc1 category: bugfix bugzilla: 186478, https://gitee.com/openeuler/kernel/issues/I4Z86P CVE: CVE-2022-0854 -------------------------------- This reverts commit aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13. It turns out this breaks at least the ath9k wireless driver, and possibly others. What the ath9k driver does on packet receive is to set up the DMA transfer with: int ath_rx_init(..) .. bf->bf_buf_addr = dma_map_single(sc->dev, skb->data, common->rx_bufsize, DMA_FROM_DEVICE); and then the receive logic (through ath_rx_tasklet()) will fetch incoming packets static bool ath_edma_get_buffers(..) .. dma_sync_single_for_cpu(sc->dev, bf->bf_buf_addr, common->rx_bufsize, DMA_FROM_DEVICE); ret = ath9k_hw_process_rxdesc_edma(ah, rs, skb->data); if (ret == -EINPROGRESS) { /*let device gain the buffer again*/ dma_sync_single_for_device(sc->dev, bf->bf_buf_addr, common->rx_bufsize, DMA_FROM_DEVICE); return false; } and it's worth noting how that first DMA sync: dma_sync_single_for_cpu(..DMA_FROM_DEVICE); is there to make sure the CPU can read the DMA buffer (possibly by copying it from the bounce buffer area, or by doing some cache flush). The iommu correctly turns that into a "copy from bounce bufer" so that the driver can look at the state of the packets. In the meantime, the device may continue to write to the DMA buffer, but we at least have a snapshot of the state due to that first DMA sync. But that _second_ DMA sync: dma_sync_single_for_device(..DMA_FROM_DEVICE); is telling the DMA mapping that the CPU wasn't interested in the area because the packet wasn't there. In the case of a DMA bounce buffer, that is a no-op. Note how it's not a sync for the CPU (the "for_device()" part), and it's not a sync for data written by the CPU (the "DMA_FROM_DEVICE" part). Or rather, it _should_ be a no-op. 
That's what commit aa6f8dcbab47 broke: it made the code bounce the buffer unconditionally, and changed the DMA_FROM_DEVICE to just unconditionally and illogically be DMA_TO_DEVICE. [ Side note: purely within the confines of the swiotlb driver it wasn't entirely illogical: The reason it did that odd DMA_FROM_DEVICE -> DMA_TO_DEVICE conversion thing is because inside the swiotlb driver, it uses just a swiotlb_bounce() helper that doesn't care about the whole distinction of who the sync is for - only which direction to bounce. So it took the "sync for device" to mean that the CPU must have been the one writing, and thought it meant DMA_TO_DEVICE. ] Also note how the commentary in that commit was wrong, probably due to that whole confusion, claiming that the commit makes the swiotlb code "bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale data from the swiotlb buffer" which is nonsensical for two reasons: - that "also when dir == DMA_TO_DEVICE" is nonsensical, as that was exactly when it always did - and should do - the bounce. - since this is a sync for the device (not for the CPU), we're clearly fundamentally not coping back stale data from the bounce buffers at all, because we'd be copying *to* the bounce buffers. So that commit was just very confused. It confused the direction of the synchronization (to the device, not the cpu) with the direction of the DMA (from the device). Reported-and-bisected-by: Oleksandr Natalenko Reported-by: Olha Cherevyk Cc: Halil Pasic Cc: Christoph Hellwig Cc: Kalle Valo Cc: Robin Murphy Cc: Toke Høiland-Jørgensen Cc: Maxime Bizon Cc: Johannes Berg Signed-off-by: Linus Torvalds Conflicts: Documentation/core-api/dma-attributes.rst include/linux/dma-mapping.h kernel/dma/swiotlb.c Signed-off-by: Liu Shixin Reviewed-by: Xiu Jianfeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- Documentation/DMA-attributes.txt | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 26 ++++++++++---------------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 8f8d97f65d73..34742fc00ec0 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -156,3 +156,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 9d39d97e09f4..aa23cade6991 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -70,6 +70,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. 
* It can be given to a device to use as a DMA source or target. A CPU cannot diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b6eee8ea867b..1624d12b60af 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -583,14 +583,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, */ for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - /* - * When dir == DMA_FROM_DEVICE we could omit the copy from the orig - * to the tlb buffer, if we knew for sure the device will - * overwirte the entire current content. But we don't. Thus - * unconditional bounce may prevent leaking swiotlb content (i.e. - * kernel memory) to user-space. - */ - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + return tlb_addr; } @@ -682,14 +679,11 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, BUG_ON(dir != DMA_TO_DEVICE); break; case SYNC_FOR_DEVICE: - /* - * Unconditional bounce is necessary to avoid corruption on - * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite - * the whole lengt of the bounce buffer. - */ - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); - BUG_ON(!valid_dma_direction(dir)); + if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); break; default: BUG(); -- Gitee From 3e10969090c6f2aab0f0136aaf59d315e12b2f31 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 2 Apr 2022 07:03:51 +0000 Subject: [PATCH 116/340] Reinstate some of "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.18-rc1 commit 901c7280ca0d5e2b4a8929fbe0bfb007ac2a6544 category: bugfix bugzilla: 186478, https://gitee.com/openeuler/kernel/issues/I4Z86P CVE: CVE-2022-0854 -------------------------------- Halil Pasic points out [1] that the full revert of that commit (revert in bddac7c1e02b), and that a partial revert that only reverts the problematic case, but still keeps some of the cleanups is probably better.  And that partial revert [2] had already been verified by Oleksandr Natalenko to also fix the issue, I had just missed that in the long discussion. So let's reinstate the cleanups from commit aa6f8dcbab47 ("swiotlb: rework "fix info leak with DMA_FROM_DEVICE""), and effectively only revert the part that caused problems. 
Link: https://lore.kernel.org/all/20220328013731.017ae3e3.pasic@linux.ibm.com/ [1] Link: https://lore.kernel.org/all/20220324055732.GB12078@lst.de/ [2] Link: https://lore.kernel.org/all/4386660.LvFx2qVVIh@natalenko.name/ [3] Suggested-by: Halil Pasic Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig" Signed-off-by: Linus Torvalds Conflicts: Documentation/core-api/dma-attributes.rst include/linux/dma-mapping.h kernel/dma/swiotlb.c Signed-off-by: Liu Shixin Reviewed-by: Xiu Jianfeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- Documentation/DMA-attributes.txt | 8 -------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 13 ++++++++----- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 34742fc00ec0..8f8d97f65d73 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -156,11 +156,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). - -DMA_ATTR_OVERWRITE ------------------- - -This is a hint to the DMA-mapping subsystem that the device is expected to -overwrite the entire mapped size, thus the caller does not require any of the -previous buffer contents to be preserved. This allows bounce-buffering -implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index aa23cade6991..9d39d97e09f4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -70,14 +70,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. * It can be given to a device to use as a DMA source or target. A CPU cannot diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1624d12b60af..694e7ccea659 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -583,11 +583,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, */ for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); - + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. 
+ */ + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); return tlb_addr; } -- Gitee From fb59563c5a1f53a87ba7f9f6a341156b6e2d2c76 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Sat, 2 Apr 2022 07:03:52 +0000 Subject: [PATCH 117/340] sched/fair: Add qos_throttle_list node in struct cfs_rq hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I50PPU CVE: NA ----------------------------------------------------------------- when unthrottle a cfs_rq at distribute_cfs_runtime(), another cpu may re-throttle this cfs_rq at qos_throttle_cfs_rq() before access the cfs_rq->throttle_list.next, but meanwhile, qos throttle will attach the cfs_rq throttle_list node to percpu qos_throttled_cfs_rq, it will change cfs_rq->throttle_list.next and cause panic or hardlockup at distribute_cfs_runtime(). Fix it by adding a qos_throttle_list node in struct cfs_rq, and qos throttle disuse the cfs_rq->throttle_list. Signed-off-by: Zhang Qiao Reviewed-by: zheng zucheng Reviewed-by: Chen Hui Signed-off-by: Yongqiang Liu --- kernel/sched/fair.c | 10 +++++++--- kernel/sched/sched.h | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 52029f3a7439..4c7c0144a852 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5059,6 +5059,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_QOS_SCHED + INIT_LIST_HEAD(&cfs_rq->qos_throttled_list); +#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6925,7 +6928,8 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); - list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); + list_add(&cfs_rq->qos_throttled_list, + &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); } static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) @@ -6943,7 +6947,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; - list_del_init(&cfs_rq->throttled_list); + list_del_init(&cfs_rq->qos_throttled_list); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -6984,7 +6988,7 @@ static int __unthrottle_qos_cfs_rqs(int cpu) int res = 0; list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), - throttled_list) { + qos_throttled_list) { if (cfs_rq_throttled(cfs_rq)) { unthrottle_qos_cfs_rq(cfs_rq); res++; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1aaff1aa89f6..ae3068153093 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -592,6 +592,11 @@ struct cfs_rq { #endif KABI_RESERVE(2) +#ifndef __GENKSYMS__ +#ifdef CONFIG_QOS_SCHED + struct list_head qos_throttled_list; +#endif +#endif }; static inline int rt_bandwidth_enabled(void) -- Gitee From 5597897f03c0077f24712c33c579beb15986b1fe Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sat, 2 Apr 2022 07:49:18 +0000 Subject: [PATCH 118/340] serial: 8250: Fix max baud limit in generic 8250 port stable inclusion from linux-4.19.130 commit 0eeaf62981ecc79e8395ca8caa1570eaf3a12257 category: bugfix bugzilla: 89545 CVE: N/A ------------------------------------------------ [ Upstream commit 7b668c064ec33f3d687c3a413d05e355172e6c92 ] Standard 8250 UART ports are designed in a way so they can communicate with baud rates up to 1/16 of a reference frequency. 
It's expected from most of the currently supported UART controllers. That's why the former version of serial8250_get_baud_rate() method called uart_get_baud_rate() with min and max baud rates passed as (port->uartclk / 16 / UART_DIV_MAX) and ((port->uartclk + tolerance) / 16) respectively. Doing otherwise, like it was suggested in commit ("serial: 8250_mtk: support big baud rate."), caused acceptance of bauds, which was higher than the normal UART controllers actually supported. As a result if some user-space program requested to set a baud greater than (uartclk / 16) it would have been permitted without truncation, but then serial8250_get_divisor(baud) (which calls uart_get_divisor() to get the reference clock divisor) would have returned a zero divisor. Setting zero divisor will cause an unpredictable effect varying from chip to chip. In case of DW APB UART the communications just stop. Lets fix this problem by getting back the limitation of (uartclk + tolerance) / 16 maximum baud supported by the generic 8250 port. Mediatek 8250 UART ports driver developer shouldn't have touched it in the first place notably seeing he already provided a custom version of set_termios() callback in that glue-driver which took into account the extended baud rate values and accordingly updated the standard and vendor-specific divisor latch registers anyway. Fixes: 81bb549fdf14 ("serial: 8250_mtk: support big baud rate.") Signed-off-by: Serge Semin Cc: Alexey Malahov Cc: Thomas Bogendoerfer Cc: Paul Burton Cc: Ralf Baechle Cc: Arnd Bergmann Cc: Long Cheng Cc: Andy Shevchenko Cc: Maxime Ripard Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: linux-mips@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mediatek@lists.infradead.org Link: https://lore.kernel.org/r/20200506233136.11842-2-Sergey.Semin@baikalelectronics.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Yi Yang Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/tty/serial/8250/8250_port.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 1867c2546cd9..3a0604b025e9 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2647,6 +2647,8 @@ static unsigned int serial8250_get_baud_rate(struct uart_port *port, struct ktermios *termios, struct ktermios *old) { + unsigned int tolerance = port->uartclk / 100; + /* * Ask the core to calculate the divisor for us. * Allow 1% tolerance at the upper limit so uart clks marginally @@ -2655,7 +2657,7 @@ static unsigned int serial8250_get_baud_rate(struct uart_port *port, */ return uart_get_baud_rate(port, termios, old, port->uartclk / 16 / UART_DIV_MAX, - port->uartclk); + (port->uartclk + tolerance) / 16); } void -- Gitee From df45decfa77ac71641a2324bb08196d164f1e8ee Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 11 Apr 2022 10:35:37 +0800 Subject: [PATCH 119/340] printk: Convert a use of sprintf to snprintf in console_unlock mainline inclusion from mainline-v5.8-rc1 commit 5661dd95a2958634485bb1a53f90a6ab621d4b0c category: bugfix bugzilla: 91291 CVE: N/A -------------------------------- When CONFIG_PRINTK is disabled (e.g. when building allnoconfig), clang warns: ../kernel/printk/printk.c:2416:10: warning: 'sprintf' will always overflow; destination buffer has size 0, but format string expands to at least 33 [-Wfortify-source] len = sprintf(text, ^ 1 warning generated. 
It is not wrong; text has a zero size when CONFIG_PRINTK is disabled because LOG_LINE_MAX and PREFIX_MAX are both zero. Change to snprintf so that this case is explicitly handled without any risk of overflow. Link: https://github.com/ClangBuiltLinux/linux/issues/846 Link: https://github.com/llvm/llvm-project/commit/6d485ff455ea2b37fef9e06e426dae6c1241b231 Link: http://lkml.kernel.org/r/20200130221644.2273-1-natechancellor@gmail.com Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Cc: clang-built-linux@googlegroups.com Signed-off-by: Nathan Chancellor Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Signed-off-by: Yi Yang Reviewed-by: Xiu Jianfeng Signed-off-by: Laibin Qiu --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0fe45941b5c7..c645a7221d0b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2423,9 +2423,9 @@ void console_unlock(void) printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); + len = snprintf(text, sizeof(text), + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; -- Gitee From 39a38fe3a705b89c418f63d6730ad5c65876ff93 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 11 Apr 2022 10:35:38 +0800 Subject: [PATCH 120/340] tty: don't crash in tty_init_dev when missing tty_port mainline inclusion from mainline-v5.5-rc1 commit 2ae0b31e0faced43c011ce3221f2535721cb6a66 category: bugfix bugzilla: 82118 CVE: N/A -------------------------------- We currently warn the user when tty->port is not set in tty_init_dev yet. The warning says that the kernel will crash later. And it really will only few lines below at: tty->port->itty = tty; So be nice and avoid the crash -- return an error instead. And update the warning. Signed-off-by: Jiri Slaby Cc: Sudip Mukherjee Link: https://lore.kernel.org/r/20191122101721.7222-1-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yi Yang Reviewed-by: Xiu Jianfeng Signed-off-by: Laibin Qiu --- drivers/tty/tty_io.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 3025c39ba6b1..dc522e994e80 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1343,9 +1343,12 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) if (!tty->port) tty->port = driver->ports[idx]; - WARN_RATELIMIT(!tty->port, - "%s: %s driver does not set tty->port. This will crash the kernel later. Fix the driver!\n", - __func__, tty->driver->name); + if (WARN_RATELIMIT(!tty->port, + "%s: %s driver does not set tty->port. This would crash the kernel. 
Fix the driver!\n", + __func__, tty->driver->name)) { + retval = -EINVAL; + goto err_release_lock; + } retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) -- Gitee From 5485fe7b1bb5be0515a0177fb26eb71d14ec3941 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Mon, 11 Apr 2022 10:35:39 +0800 Subject: [PATCH 121/340] tty: fix crash in release_tty if tty->port is not set mainline inclusion from mainline-v5.10-rc3 commit 4466d6d2f80c1193e0845d110277c56da77a6418 category: bugfix bugzilla: 82118 CVE: N/A -------------------------------- Commit 2ae0b31e0face ("tty: don't crash in tty_init_dev when missing tty_port") didn't fully prevent the crash as the cleanup path in tty_init_dev() calls release_tty() which dereferences tty->port without checking it for non-null. Add tty->port checks to release_tty to avoid the kernel crash. Fixes: 2ae0b31e0face ("tty: don't crash in tty_init_dev when missing tty_port") Signed-off-by: Matthias Reichl Link: https://lore.kernel.org/r/20201105123432.4448-1-hias@horus.com Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yi Yang Reviewed-by: Xiu Jianfeng Signed-off-by: Laibin Qiu --- drivers/tty/tty_io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index dc522e994e80..708be6258609 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1512,10 +1512,12 @@ static void release_tty(struct tty_struct *tty, int idx) tty->ops->shutdown(tty); tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); - tty->port->itty = NULL; + if (tty->port) + tty->port->itty = NULL; if (tty->link) tty->link->port->itty = NULL; - tty_buffer_cancel_work(tty->port); + if (tty->port) + tty_buffer_cancel_work(tty->port); if (tty->link) tty_buffer_cancel_work(tty->link->port); -- Gitee From 4b4a6296372d5df1afa1f11db56f72c13abfc5cd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 12 Apr 2022 15:55:21 +0800 Subject: [PATCH 122/340] PM: wakeup: simplify the output logic of pm_show_wakelocks() stable inclusion from linux-4.19.228 commit cf94a8a3ff3d7aa06742f1beb5166fa45ba72122 -------------------------------- commit c9d967b2ce40d71e968eb839f36c936b8a9cf1ea upstream. The buffer handling in pm_show_wakelocks() is tricky, and hopefully correct. Ensure it really is correct by using sysfs_emit_at() which handles all of the tricky string handling logic in a PAGE_SIZE buffer for us automatically as this is a sysfs file being read from. Reviewed-by: Lee Jones Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/power/wakelock.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 4210152e56f0..aad7c8fcb22f 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -39,23 +39,19 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); if (wl->ws.active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - - str += scnprintf(str, end - str, "\n"); + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 -- Gitee From cead80db71f5bcd0ee035c33837d7e1b750d022e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Apr 2022 15:55:22 +0800 Subject: [PATCH 123/340] netfilter: nft_payload: do not update layer 4 checksum when mangling fragments stable inclusion from linux-4.19.228 commit a622b1fdfe1c725a6e9cf34d04e92f5c36a99606 -------------------------------- commit 4e1860a3863707e8177329c006d10f9e37e097a8 upstream. IP fragments do not come with the transport header, hence skip bogus layer 4 checksum updates. Fixes: 1814096980bb ("netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields") Reported-and-tested-by: Steffen Weinreich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/netfilter/nft_payload.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index b1a9f330a51f..fd87216bc0a9 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -194,6 +194,9 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, struct sk_buff *skb, unsigned int *l4csum_offset) { + if (pkt->xt.fragoff) + return -1; + switch (pkt->tprot) { case IPPROTO_TCP: *l4csum_offset = offsetof(struct tcphdr, check); -- Gitee From 1dd442ba8899cc84d8d882c8367c620796cf6d7f Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 12 Apr 2022 15:55:23 +0800 Subject: [PATCH 124/340] serial: 8250: of: Fix mapped region size when using reg-offset property stable inclusion from linux-4.19.228 commit ff6cb8fc12edee1a4cfe5f2915b06a4d6a5f9d1f -------------------------------- commit d06b1cf28297e27127d3da54753a3a01a2fa2f28 upstream. 8250_of supports a reg-offset property which is intended to handle cases where the device registers start at an offset inside the region of memory allocated to the device. The Xilinx 16550 UART, for which this support was initially added, requires this. However, the code did not adjust the overall size of the mapped region accordingly, causing the driver to request an area of memory past the end of the device's allocation. For example, if the UART was allocated an address of 0xb0130000, size of 0x10000 and reg-offset of 0x1000 in the device tree, the region of memory reserved was b0131000-b0140fff, which caused the driver for the region starting at b0140000 to fail to probe. Fix this by subtracting reg-offset from the mapped region size. 
Fixes: b912b5e2cfb3 ([POWERPC] Xilinx: of_serial support for Xilinx uart 16550.) Cc: stable Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220112194214.881844-1-robert.hancock@calian.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/serial/8250/8250_of.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index 2488de1c4bc4..dcd2105e2b60 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -104,8 +104,17 @@ static int of_platform_serial_setup(struct platform_device *ofdev, port->mapsize = resource_size(&resource); /* Check for shifted address mapping */ - if (of_property_read_u32(np, "reg-offset", &prop) == 0) + if (of_property_read_u32(np, "reg-offset", &prop) == 0) { + if (prop >= port->mapsize) { + dev_warn(&ofdev->dev, "reg-offset %u exceeds region size %pa\n", + prop, &port->mapsize); + ret = -EINVAL; + goto err_unprepare; + } + port->mapbase += prop; + port->mapsize -= prop; + } port->iotype = UPIO_MEM; if (of_property_read_u32(np, "reg-io-width", &prop) == 0) { -- Gitee From 4fb55bc78cc043e919c58f52489eafdc9acb311b Mon Sep 17 00:00:00 2001 From: Valentin Caron Date: Tue, 12 Apr 2022 15:55:24 +0800 Subject: [PATCH 125/340] serial: stm32: fix software flow control transfer stable inclusion from linux-4.19.228 commit a41398a5a1a0dde794c65ef1fa6b2323e2db68bc -------------------------------- commit 037b91ec7729524107982e36ec4b40f9b174f7a2 upstream. x_char is ignored by stm32_usart_start_tx() when xmit buffer is empty. Fix start_tx condition to allow x_char to be sent. Fixes: 48a6092fb41f ("serial: stm32-usart: Add STM32 USART Driver") Cc: stable Signed-off-by: Erwan Le Ray Signed-off-by: Valentin Caron Link: https://lore.kernel.org/r/20220111164441.6178-3-valentin.caron@foss.st.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/serial/stm32-usart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index e8d7a7bb4339..f761e1038c34 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -506,7 +506,7 @@ static void stm32_start_tx(struct uart_port *port) { struct circ_buf *xmit = &port->state->xmit; - if (uart_circ_empty(xmit)) + if (uart_circ_empty(xmit) && !port->x_char) return; stm32_transmit_chars(port); -- Gitee From 9526d3d74a63d2e1b4805e6b0ada3333910fefe9 Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Tue, 12 Apr 2022 15:55:25 +0800 Subject: [PATCH 126/340] tty: n_gsm: fix SW flow control encoding/handling stable inclusion from linux-4.19.228 commit 47bc9be8d6d94a25a598b4056078925dd6d15851 -------------------------------- commit 8838b2af23caf1ff0610caef2795d6668a013b2d upstream. n_gsm is based on the 3GPP 07.010 and its newer version is the 3GPP 27.010. See https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=1516 The changes from 07.010 to 27.010 are non-functional. Therefore, I refer to the newer 27.010 here. Chapter 5.2.7.3 states that DC1 (XON) and DC3 (XOFF) are the control characters defined in ISO/IEC 646. These shall be quoted if seen in the data stream to avoid interpretation as flow control characters. ISO/IEC 646 refers to the set of ISO standards described as the ISO 7-bit coded character set for information interchange. 
Its final version is also known as ITU T.50. See https://www.itu.int/rec/T-REC-T.50-199209-I/en To abide the standard it is needed to quote DC1 and DC3 correctly if these are seen as data bytes and not as control characters. The current implementation already tries to enforce this but fails to catch all defined cases. 3GPP 27.010 chapter 5.2.7.3 clearly states that the most significant bit shall be ignored for DC1 and DC3 handling. The current implementation handles only the case with the most significant bit set 0. Cases in which DC1 and DC3 have the most significant bit set 1 are left unhandled. This patch fixes this by masking the data bytes with ISO_IEC_646_MASK (only the 7 least significant bits set 1) before comparing them with XON (a.k.a. DC1) and XOFF (a.k.a. DC3) when testing which byte values need quotation via byte stuffing. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220120101857.2509-1-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/n_gsm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 86b7e20ffd7f..7de54c78a7b9 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -313,6 +313,7 @@ static struct tty_driver *gsm_tty_driver; #define GSM1_ESCAPE_BITS 0x20 #define XON 0x11 #define XOFF 0x13 +#define ISO_IEC_646_MASK 0x7F static const struct tty_port_operations gsm_port_ops; @@ -531,7 +532,8 @@ static int gsm_stuff_frame(const u8 *input, u8 *output, int len) int olen = 0; while (len--) { if (*input == GSM1_SOF || *input == GSM1_ESCAPE - || *input == XON || *input == XOFF) { + || (*input & ISO_IEC_646_MASK) == XON + || (*input & ISO_IEC_646_MASK) == XOFF) { *output++ = GSM1_ESCAPE; *output++ = *input++ ^ GSM1_ESCAPE_BITS; olen++; -- Gitee From 2069e835d6c7440764f80db4ba33ad1e88b1bdc8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 12 Apr 2022 15:55:26 +0800 Subject: [PATCH 127/340] ipv6_tunnel: Rate limit warning messages stable inclusion from linux-4.19.228 commit e3e01ed702c7b5f5d306c2ef566d30b84aa57126 -------------------------------- commit 6cee105e7f2ced596373951d9ea08dacc3883c68 upstream. The warning messages can be invoked from the data path for every packet transmitted through an ip6gre netdev, leading to high CPU utilization. Fix that by rate limiting the messages. Fixes: 09c6bbf090ec ("[IPV6]: Do mandatory IPv6 tunnel endpoint checks in realtime") Reported-by: Maksym Yaremchuk Tested-by: Maksym Yaremchuk Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv6/ip6_tunnel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 35c127c3eee7..b647a4037679 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1005,14 +1005,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) - pr_warn("%s xmit: Local address not yet configured!\n", - p->name); + pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", + p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) - pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", - p->name); + pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", + p->name); else ret = 1; rcu_read_unlock(); -- Gitee From 711eff8f8f8ef22caf8282b927633d9aebc06ffb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 12 Apr 2022 15:55:27 +0800 Subject: [PATCH 128/340] ping: fix the sk_bound_dev_if match in ping_lookup stable inclusion from linux-4.19.228 commit a921507836877c4e28084f7bfd35ac2608321268 -------------------------------- commit 2afc3b5a31f9edf3ef0f374f5d70610c79c93a42 upstream. When 'ping' changes to use PING socket instead of RAW socket by: # sysctl -w net.ipv4.ping_group_range="0 100" the selftests 'router_broadcast.sh' will fail, as such command # ip vrf exec vrf-h1 ping -I veth0 198.51.100.255 -b can't receive the response skb by the PING socket. It's caused by mismatch of sk_bound_dev_if and dif in ping_rcv() when looking up the PING socket, as dif is vrf-h1 if dif's master was set to vrf-h1. This patch is to fix this regression by also checking the sk_bound_dev_if against sdif so that the packets can stil be received even if the socket is not bound to the vrf device but to the real iif. Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Reported-by: Hangbin Liu Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/ping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 862744c28548..276442443d32 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -225,7 +225,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) continue; } - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != inet_sdif(skb)) continue; sock_hold(sk); -- Gitee From b8ab968eaf1b13f1b7a591aea1df65dff3696123 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:28 +0800 Subject: [PATCH 129/340] ipv4: avoid using shared IP generator for connected sockets stable inclusion from linux-4.19.228 commit eb04c6d1ec67e30f3aa5ef82112cbfdbddfd4f65 -------------------------------- commit 23f57406b82de51809d5812afd96f210f8b627f3 upstream. ip_select_ident_segs() has been very conservative about using the connected socket private generator only for packets with IP_DF set, claiming it was needed for some VJ compression implementations. As mentioned in this referenced document, this can be abused. 
(Ref: Off-Path TCP Exploits of the Mixed IPID Assignment) Before switching to pure random IPID generation and possibly hurt some workloads, lets use the private inet socket generator. Not only this will remove one vulnerability, this will also improve performance of TCP flows using pmtudisc==IP_PMTUDISC_DONT Fixes: 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reported-by: Ray Che Cc: Willy Tarreau Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- include/net/ip.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 17766586dfa0..621eeb9f142e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -437,19 +437,18 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); + /* We had many attacks based on IPID, use the private + * generator as much as we can. + */ + if (sk && inet_sk(sk)->inet_daddr) { + iph->id = htons(inet_sk(sk)->inet_id); + inet_sk(sk)->inet_id += segs; + return; + } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { - /* This is only to work around buggy Windows95/2000 - * VJ compression implementations. If the ID field - * does not change, they drop every other packet in - * a TCP stream using header compression. - */ - if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; - } else { - iph->id = 0; - } + iph->id = 0; } else { + /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } -- Gitee From 891412134fbd7389e6aed6b117f62e7db657513a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:29 +0800 Subject: [PATCH 130/340] ipv6: annotate accesses to fn->fn_sernum stable inclusion from linux-4.19.228 commit 243e898452adcf6055c9b2632becef7c59f02874 -------------------------------- commit aafc2e3285c2d7a79b7ee15221c19fbeca7b1509 upstream. struct fib6_node's fn_sernum field can be read while other threads change it. Add READ_ONCE()/WRITE_ONCE() annotations. 
Do not change existing smp barriers in fib6_get_cookie_safe() and __fib6_update_sernum_upto_root() syzbot reported: BUG: KCSAN: data-race in fib6_clean_node / inet6_csk_route_socket write to 0xffff88813df62e2c of 4 bytes by task 1920 on cpu 1: fib6_clean_node+0xc2/0x260 net/ipv6/ip6_fib.c:2178 fib6_walk_continue+0x38e/0x430 net/ipv6/ip6_fib.c:2112 fib6_walk net/ipv6/ip6_fib.c:2160 [inline] fib6_clean_tree net/ipv6/ip6_fib.c:2240 [inline] __fib6_clean_all+0x1a9/0x2e0 net/ipv6/ip6_fib.c:2256 fib6_flush_trees+0x6c/0x80 net/ipv6/ip6_fib.c:2281 rt_genid_bump_ipv6 include/net/net_namespace.h:488 [inline] addrconf_dad_completed+0x57f/0x870 net/ipv6/addrconf.c:4230 addrconf_dad_work+0x908/0x1170 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:359 ret_from_fork+0x1f/0x30 read to 0xffff88813df62e2c of 4 bytes by task 15701 on cpu 0: fib6_get_cookie_safe include/net/ip6_fib.h:285 [inline] rt6_get_cookie include/net/ip6_fib.h:306 [inline] ip6_dst_store include/net/ip6_route.h:234 [inline] inet6_csk_route_socket+0x352/0x3c0 net/ipv6/inet6_connection_sock.c:109 inet6_csk_xmit+0x91/0x1e0 net/ipv6/inet6_connection_sock.c:121 __tcp_transmit_skb+0x1323/0x1840 net/ipv4/tcp_output.c:1402 tcp_transmit_skb net/ipv4/tcp_output.c:1420 [inline] tcp_write_xmit+0x1450/0x4460 net/ipv4/tcp_output.c:2680 __tcp_push_pending_frames+0x68/0x1c0 net/ipv4/tcp_output.c:2864 tcp_push+0x2d9/0x2f0 net/ipv4/tcp.c:725 mptcp_push_release net/mptcp/protocol.c:1491 [inline] __mptcp_push_pending+0x46c/0x490 net/mptcp/protocol.c:1578 mptcp_sendmsg+0x9ec/0xa50 net/mptcp/protocol.c:1764 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:643 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] kernel_sendmsg+0x97/0xd0 net/socket.c:745 sock_no_sendpage+0x84/0xb0 net/core/sock.c:3086 inet_sendpage+0x9d/0xc0 net/ipv4/af_inet.c:834 kernel_sendpage+0x187/0x200 net/socket.c:3492 sock_sendpage+0x5a/0x70 net/socket.c:1007 pipe_to_sendpage+0x128/0x160 fs/splice.c:364 splice_from_pipe_feed fs/splice.c:418 [inline] __splice_from_pipe+0x207/0x500 fs/splice.c:562 splice_from_pipe fs/splice.c:597 [inline] generic_splice_sendpage+0x94/0xd0 fs/splice.c:746 do_splice_from fs/splice.c:767 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:936 splice_direct_to_actor+0x345/0x650 fs/splice.c:891 do_splice_direct+0x106/0x190 fs/splice.c:979 do_sendfile+0x675/0xc40 fs/read_write.c:1245 __do_sys_sendfile64 fs/read_write.c:1310 [inline] __se_sys_sendfile64 fs/read_write.c:1296 [inline] __x64_sys_sendfile64+0x102/0x140 fs/read_write.c:1296 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000026f -> 0x00000271 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15701 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 The Fixes tag I chose is probably arbitrary, I do not think we need to backport this patch to older kernels. 
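For readers unfamiliar with the annotations: READ_ONCE()/WRITE_ONCE() force a single, untorn access and mark the race as intentional for KCSAN; they add no ordering beyond the existing smp_wmb()/smp_rmb() pair, which is why those barriers are left alone. A simplified sketch of what the annotations amount to for a plain int (the real kernel macros handle more cases):

	/* simplified, illustration only -- not the kernel's actual definitions */
	#define WRITE_ONCE_SKETCH(x, val)  (*(volatile typeof(x) *)&(x) = (val))
	#define READ_ONCE_SKETCH(x)        (*(const volatile typeof(x) *)&(x))

	/* writer side, e.g. __fib6_update_sernum_upto_root() */
	WRITE_ONCE_SKETCH(fn->fn_sernum, sernum);

	/* reader side, e.g. fib6_get_cookie_safe() */
	int sernum = READ_ONCE_SKETCH(fn->fn_sernum);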
Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220120174112.1126644-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 23 +++++++++++++---------- net/ipv6/route.c | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 62c936230cc8..b4fea9dd589d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -243,7 +243,7 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, fn = rcu_dereference(f6i->fib6_node); if (fn) { - *cookie = fn->fn_sernum; + *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ smp_rmb(); status = true; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e0e464b72c1f..5ff67cb8b6ac 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -112,7 +112,7 @@ void fib6_update_sernum(struct net *net, struct fib6_info *f6i) fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) - fn->fn_sernum = fib6_new_sernum(net); + WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* @@ -544,12 +544,13 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; - cb->args[5] = w->root->fn_sernum; + cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { - if (cb->args[5] != w->root->fn_sernum) { + int sernum = READ_ONCE(w->root->fn_sernum); + if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ - cb->args[5] = w->root->fn_sernum; + cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; @@ -1203,7 +1204,7 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt, /* paired with smp_rmb() in rt6_get_cookie_safe() */ smp_wmb(); while (fn) { - fn->fn_sernum = sernum; + WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } @@ -1983,8 +1984,8 @@ static int fib6_clean_node(struct fib6_walker *w) }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && - w->node->fn_sernum != c->sernum) - w->node->fn_sernum = c->sernum; + READ_ONCE(w->node->fn_sernum) != c->sernum) + WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); @@ -2332,7 +2333,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; - iter->sernum = iter->w.root->fn_sernum; + iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } @@ -2360,8 +2361,10 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { - if (iter->sernum != iter->w.root->fn_sernum) { - iter->sernum = iter->w.root->fn_sernum; + int sernum = READ_ONCE(iter->w.root->fn_sernum); + + if (iter->sernum != sernum) { + iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 80dd55c436fa..8efb03cc6f24 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2320,7 +2320,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (from) { fn = 
rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) - fn->fn_sernum = -1; + WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); -- Gitee From ff03d08f44db106a7a4cea906cda6029eeaf479c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Apr 2022 15:55:30 +0800 Subject: [PATCH 131/340] NFS: Ensure the server has an up to date ctime before hardlinking stable inclusion from linux-4.19.228 commit ddeea0002d17984d66bb63f9b66b96fd604756ad -------------------------------- [ Upstream commit 204975036b34f55237bc44c8a302a88468ef21b5 ] Creating a hard link is required by POSIX to update the file ctime, so ensure that the file data is synced to disk so that we don't clobber the updated ctime by writing back after creating the hard link. Fixes: 9f7682728728 ("NFS: Move the delegation return down into nfs4_proc_link()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 67ece39c7800..ede35472280d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2052,6 +2052,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { ihold(inode); -- Gitee From a2e1b59ecb07811efb3b07a6ec50723b8e05acea Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Apr 2022 15:55:31 +0800 Subject: [PATCH 132/340] NFS: Ensure the server has an up to date ctime before renaming stable inclusion from linux-4.19.228 commit 8b5c9de150c62f686ff274039b97fe3a2297ada6 -------------------------------- [ Upstream commit 6ff9d99bb88faebf134ca668842349d9718e5464 ] Renaming a file is required by POSIX to update the file ctime, so ensure that the file data is synced to disk so that we don't clobber the updated ctime by writing back after creating the hard link. Fixes: f2c2c552f119 ("NFS: Move delegation recall into the NFSv4 callback for rename_setup()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ede35472280d..72e5ae8cc4ff 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2142,6 +2142,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + if (S_ISREG(old_inode->i_mode)) + nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); -- Gitee From 4540e9afdf2d1b7f3463d3a8ac7c92a95926130a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 12 Apr 2022 15:55:32 +0800 Subject: [PATCH 133/340] phylib: fix potential use-after-free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from linux-4.19.228 commit 67d271760b037ce0806d687ee6057edc8afd4205 -------------------------------- [ Upstream commit cbda1b16687580d5beee38273f6241ae3725960c ] Commit bafbdd527d56 ("phylib: Add device reset GPIO support") added call to phy_device_reset(phydev) after the put_device() call in phy_detach(). The comment before the put_device() call says that the phydev might go away with put_device(). 
Fix potential use-after-free by calling phy_device_reset() before put_device(). Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support") Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220119162748.32418-1-kabel@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/net/phy/phy_device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1117355313bc..2de2cbed7c5f 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1154,6 +1154,9 @@ void phy_detach(struct phy_device *phydev) phydev->mdio.dev.driver == &genphy_driver.mdiodrv.driver) device_release_driver(&phydev->mdio.dev); + /* Assert the reset signal */ + phy_device_reset(phydev, 1); + /* * The phydev might go away on the put_device() below, so avoid * a use-after-free bug by reading the underlying bus first. @@ -1163,9 +1166,6 @@ void phy_detach(struct phy_device *phydev) put_device(&phydev->mdio.dev); if (ndev_owner != bus->owner) module_put(bus->owner); - - /* Assert the reset signal */ - phy_device_reset(phydev, 1); } EXPORT_SYMBOL(phy_detach); -- Gitee From 7282d57e98b3bbceba7aa3a9eaf9c2e3a2d6c5b7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:33 +0800 Subject: [PATCH 134/340] ipv4: raw: lock the socket in raw_bind() stable inclusion from linux-4.19.228 commit 3edc526019751c551a05b3ef465d07678fd239de -------------------------------- [ Upstream commit 153a0d187e767c68733b8e9f46218eb1f41ab902 ] For some reason, raw_bind() forgot to lock the socket. BUG: KCSAN: data-race in __ip4_datagram_connect / raw_bind write to 0xffff8881170d4308 of 4 bytes by task 5466 on cpu 0: raw_bind+0x1b0/0x250 net/ipv4/raw.c:739 inet_bind+0x56/0xa0 net/ipv4/af_inet.c:443 __sys_bind+0x14b/0x1b0 net/socket.c:1697 __do_sys_bind net/socket.c:1708 [inline] __se_sys_bind net/socket.c:1706 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1706 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff8881170d4308 of 4 bytes by task 5468 on cpu 1: __ip4_datagram_connect+0xb7/0x7b0 net/ipv4/datagram.c:39 ip4_datagram_connect+0x2a/0x40 net/ipv4/datagram.c:89 inet_dgram_connect+0x107/0x190 net/ipv4/af_inet.c:576 __sys_connect_file net/socket.c:1900 [inline] __sys_connect+0x197/0x1b0 net/socket.c:1917 __do_sys_connect net/socket.c:1927 [inline] __se_sys_connect net/socket.c:1924 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1924 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0x0003007f Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 5468 Comm: syz-executor.5 Not tainted 5.17.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/raw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bc3da7796e8a..a24f94abe14f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -724,6 +724,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret = -EINVAL; int chk_addr_ret; + lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; @@ -743,7 +744,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; -out: return ret; +out: + release_sock(sk); + return ret; } /* -- Gitee From a816d3f78a6bda458969bf09ec173882591dbc84 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:34 +0800 Subject: [PATCH 135/340] ipv4: tcp: send zero IPID in SYNACK messages stable inclusion from linux-4.19.228 commit 08ed3cabc07e9f6035466e33fbdfad1978e05e06 -------------------------------- [ Upstream commit 970a5a3ea86da637471d3cd04d513a0755aba4bf ] In commit 431280eebed9 ("ipv4: tcp: send zero IPID for RST and ACK sent in SYN-RECV and TIME-WAIT state") we took care of some ctl packets sent by TCP. It turns out we need to use a similar strategy for SYNACK packets. By default, they carry IP_DF and IPID==0, but there are ways to ask them to use the hashed IP ident generator and thus be used to build off-path attacks. (Ref: Off-Path TCP Exploits of the Mixed IPID Assignment) One of this way is to force (before listener is started) echo 1 >/proc/sys/net/ipv4/ip_no_pmtu_disc Another way is using forged ICMP ICMP_FRAG_NEEDED with a very small MTU (like 68) to force a false return from ip_dont_fragment() In this patch, ip_build_and_send_pkt() uses the following heuristics. 1) Most SYNACK packets are smaller than IPV4_MIN_MTU and therefore can use IP_DF regardless of the listener or route pmtu setting. 2) In case the SYNACK packet is bigger than IPV4_MIN_MTU, we use prandom_u32() generator instead of the IPv4 hashed ident one. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: Ray Che Reviewed-by: David Ahern Cc: Geoff Alexander Cc: Willy Tarreau Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/ip_output.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e63905f7f6f9..1a6c05908584 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -160,12 +160,19 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - if (ip_dont_fragment(sk, &rt->dst)) { + /* Do not bother generating IPID for small packets (eg SYNACK) */ + if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; - __ip_select_ident(net, iph, 1); + /* TCP packets here are SYNACK with fat IPv4/TCP options. + * Avoid using the hashed IP ident generator. 
+ */ + if (sk->sk_protocol == IPPROTO_TCP) + iph->id = (__force __be16)prandom_u32(); + else + __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { -- Gitee From 7865710a1bc0a52cbed5636c530cba97902f3847 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:35 +0800 Subject: [PATCH 136/340] rtnetlink: make sure to refresh master_dev/m_ops in __rtnl_newlink() stable inclusion from linux-4.19.228 commit a01e60a1ec6bef9be471fb7182a33c6d6f124e93 -------------------------------- commit c6f6f2444bdbe0079e41914a35081530d0409963 upstream. While looking at one unrelated syzbot bug, I found the replay logic in __rtnl_newlink() to potentially trigger use-after-free. It is better to clear master_dev and m_ops inside the loop, in case we have to replay it. Fixes: ba7d49b1f0f8 ("rtnetlink: provide api for getting and setting slave info") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Link: https://lore.kernel.org/r/20220201012106.216495-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 83de32e34bb5..6cd0dbe42baa 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2936,9 +2936,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + const struct rtnl_link_ops *m_ops; struct net_device *dev; - struct net_device *master_dev = NULL; + struct net_device *master_dev; struct ifinfomsg *ifm; char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; @@ -2973,6 +2973,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = NULL; } + master_dev = NULL; + m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) -- Gitee From b1e20c74a3f035c6a93f2fd9e179a1af6c4df6af Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:36 +0800 Subject: [PATCH 137/340] af_packet: fix data-race in packet_setsockopt / packet_setsockopt stable inclusion from linux-4.19.228 commit 1f0c712832907baef27d13721b3ca2e515d39ecf -------------------------------- commit e42e70ad6ae2ae511a6143d2e8da929366e58bd9 upstream. When packet_setsockopt( PACKET_FANOUT_DATA ) reads po->fanout, no lock is held, meaning that another thread can change po->fanout. Given that po->fanout can only be set once during the socket lifetime (it is only cleared from fanout_release()), we can use READ_ONCE()/WRITE_ONCE() to document the race. 
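The shape of that documentation, sketched with invented names rather than the af_packet structures:

  #include <linux/compiler.h>
  #include <linux/types.h>

  struct fanout_state;                    /* opaque for the sketch */

  struct pkt_sock {
          struct fanout_state *fanout;    /* set at most once, cleared only
                                           * when the socket is released */
  };

  /* Publisher: runs under the protocol's own locking. */
  static void pkt_attach_fanout(struct pkt_sock *po, struct fanout_state *f)
  {
          /* Paired with the READ_ONCE() in pkt_has_fanout(). */
          WRITE_ONCE(po->fanout, f);
  }

  /* Lockless test: another thread may be publishing concurrently. */
  static bool pkt_has_fanout(const struct pkt_sock *po)
  {
          return READ_ONCE(po->fanout) != NULL;
  }

Because the pointer is written at most once during the socket's lifetime, annotating the accesses is sufficient; no additional locking is needed on the read side.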
BUG: KCSAN: data-race in packet_setsockopt / packet_setsockopt write to 0xffff88813ae8e300 of 8 bytes by task 14653 on cpu 0: fanout_add net/packet/af_packet.c:1791 [inline] packet_setsockopt+0x22fe/0x24a0 net/packet/af_packet.c:3931 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88813ae8e300 of 8 bytes by task 14654 on cpu 1: packet_setsockopt+0x691/0x24a0 net/packet/af_packet.c:3935 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000000000000000 -> 0xffff888106f8c000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 14654 Comm: syz-executor.3 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 47dceb8ecdc1 ("packet: add classic BPF fanout mode") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Link: https://lore.kernel.org/r/20220201022358.330621-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/packet/af_packet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fbf7d5ef83c7..bacb1bbe53e2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1729,7 +1729,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = -ENOSPC; if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { __dev_remove_pack(&po->prot_hook); - po->fanout = match; + + /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ + WRITE_ONCE(po->fanout, match); + po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); @@ -3871,7 +3874,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv } case PACKET_FANOUT_DATA: { - if (!po->fanout) + /* Paired with the WRITE_ONCE() in fanout_add() */ + if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); -- Gitee From f4af292adec47119344da5c87127724ecc7191d5 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 12 Apr 2022 15:55:37 +0800 Subject: [PATCH 138/340] block: bio-integrity: Advance seed correctly for larger interval sizes stable inclusion from linux-4.19.228 commit d21b6bbc78442bf490e7215ed620ec0878856a0e -------------------------------- commit b13e0c71856817fca67159b11abac350e41289f5 upstream. Commit 309a62fa3a9e ("bio-integrity: bio_integrity_advance must update integrity seed") added code to update the integrity seed value when advancing a bio. However, it failed to take into account that the integrity interval might be larger than the 512-byte block layer sector size. This broke bio splitting on PI devices with 4KB logical blocks. The seed value should be advanced by bio_integrity_intervals() and not the number of sectors. 
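A worked example of the arithmetic, assuming a device whose integrity interval is 4096 bytes (interval_exp = 12); the numbers are illustrative only:

  unsigned int bytes_done   = 8192;                           /* data advanced        */
  unsigned int sectors      = bytes_done >> 9;                /* 16 x 512-byte sectors */
  unsigned int interval_exp = 12;                             /* 4096-byte PI interval */
  unsigned int intervals    = sectors >> (interval_exp - 9);  /* 2 intervals           */

The seed (the expected reference tag) must move by the 2 intervals that were actually consumed, not by the 16 sectors; advancing it by sectors leaves it pointing past the protection information that still covers the remaining data, which is what broke splitting on 4KB-block PI devices.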
Cc: Dmitry Monakhov Cc: stable@vger.kernel.org Fixes: 309a62fa3a9e ("bio-integrity: bio_integrity_advance must update integrity seed") Tested-by: Dmitry Ivanov Reported-by: Alexey Lyashkov Signed-off-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220204034209.4193-1-martin.petersen@oracle.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 0b96220d0efd..2e22a3f7466a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -399,7 +399,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - bip->bip_iter.bi_sector += bytes_done >> 9; + bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } EXPORT_SYMBOL(bio_integrity_advance); -- Gitee From 75624b5bc0485778aef9f46ebbf5e5df3c0d054c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 12 Apr 2022 15:55:38 +0800 Subject: [PATCH 139/340] iommu/vt-d: Fix potential memory leak in intel_setup_irq_remapping() stable inclusion from linux-4.19.228 commit a31cb1f0fb6caf46ffe88c41252b6b7a4ee062d9 -------------------------------- commit 99e675d473eb8cf2deac1376a0f840222fc1adcf upstream. After commit e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated"). For tear down scenario, fn is only freed after fail to allocate ir_domain, though it also should be freed in case dmar_enable_qi returns error. Besides free fn, irq_domain and ir_msi_domain need to be removed as well if intel_setup_irq_remapping fails to enable queued invalidation. Improve the rewinding path by add out_free_ir_domain and out_free_fwnode lables per Baolu's suggestion. 
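The rewinding pattern in general form; the context and helper names here are placeholders, not the driver's functions:

  struct ctx;
  int  alloc_a(struct ctx *c);   void free_a(struct ctx *c);
  int  alloc_b(struct ctx *c);   void free_b(struct ctx *c);
  int  enable_hw(struct ctx *c);

  static int setup_example(struct ctx *c)
  {
          int ret;

          ret = alloc_a(c);
          if (ret)
                  return ret;

          ret = alloc_b(c);
          if (ret)
                  goto out_free_a;

          ret = enable_hw(c);
          if (ret)
                  goto out_free_b;

          return 0;

  out_free_b:
          free_b(c);              /* undo in reverse order of setup */
  out_free_a:
          free_a(c);
          return ret;
  }

Each label releases exactly what had been set up before the failing step, in reverse order; the added out_free_ir_domain and out_free_fwnode labels restore that property for the queued-invalidation failure path.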
Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") Suggested-by: Lu Baolu Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220119063640.16864-1-guoqing.jiang@linux.dev Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20220128031002.2219155-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/iommu/intel_irq_remapping.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 967450bd421a..276005451914 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -539,7 +539,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) irq_domain_free_fwnode(fn); if (!iommu->ir_domain) { pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); - goto out_free_bitmap; + goto out_free_fwnode; } iommu->ir_msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain, @@ -563,7 +563,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (dmar_enable_qi(iommu)) { pr_err("Failed to enable queued invalidation\n"); - goto out_free_bitmap; + goto out_free_ir_domain; } } @@ -587,6 +587,14 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) return 0; +out_free_ir_domain: + if (iommu->ir_msi_domain) + irq_domain_remove(iommu->ir_msi_domain); + iommu->ir_msi_domain = NULL; + irq_domain_remove(iommu->ir_domain); + iommu->ir_domain = NULL; +out_free_fwnode: + irq_domain_free_fwnode(fn); out_free_bitmap: kfree(bitmap); out_free_pages: -- Gitee From 0b3f58fe48e9825546b647b5faf58a3701427a5e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 Apr 2022 15:55:39 +0800 Subject: [PATCH 140/340] iommu/amd: Fix loop timeout issue in iommu_ga_log_enable() stable inclusion from linux-4.19.228 commit bd2a6021b8778bf671a2f9739241543398a7c853 -------------------------------- commit 9b45a7738eec52bf0f5d8d3d54e822962781c5f2 upstream. The polling loop for the register change in iommu_ga_log_enable() needs to have a udelay() in it. Otherwise the CPU might be faster than the IOMMU hardware and wrongly trigger the WARN_ON() further down the code stream. Use a 10us for udelay(), has there is some hardware where activation of the GA log can take more than a 100ms. A future optimization should move the activation check of the GA log to the point where it gets used for the first time. But that is a bigger change and not suitable for a fix. 
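A minimal sketch of a bounded register poll with a delay between reads; the register, mask and iteration budget below are placeholders, not the AMD IOMMU definitions:

  #include <linux/delay.h>
  #include <linux/errno.h>
  #include <linux/io.h>
  #include <linux/types.h>

  #define POLL_ITERATIONS 100000          /* placeholder budget */

  /* Returns 0 once 'mask' is observed in the status register,
   * -ETIMEDOUT otherwise. The udelay() keeps a fast CPU from burning
   * through the whole budget before slow hardware can set the bit.
   */
  static int wait_for_status(void __iomem *status_reg, u32 mask)
  {
          int i;

          for (i = 0; i < POLL_ITERATIONS; i++) {
                  if (readl(status_reg) & mask)
                          return 0;
                  udelay(10);
          }
          return -ETIMEDOUT;
  }

Without the delay the loop is bounded by CPU speed rather than by elapsed time, which is how the spurious WARN_ON() described above was triggered.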
Fixes: 8bda0cfbdc1a ("iommu/amd: Detect and initialize guest vAPIC log") Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20220204115537.3894-1-joro@8bytes.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/iommu/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 1e9a5da562f0..29bf91e50d41 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -769,6 +770,7 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu) status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & (MMIO_STATUS_GALOG_RUN_MASK)) break; + udelay(10); } if (i >= LOOP_TIMEOUT) -- Gitee From 5a1955f471aa77d43afc8bf50ca84dfe1f03b7d1 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 12 Apr 2022 15:55:40 +0800 Subject: [PATCH 141/340] ext4: fix error handling in ext4_restore_inline_data() stable inclusion from linux-4.19.228 commit e9368c941a26098a199ee357f0370d49ac30b47f -------------------------------- commit 897026aaa73eb2517dfea8d147f20ddb0b813044 upstream. While running "./check -I 200 generic/475" it sometimes gives below kernel BUG(). Ideally we should not call ext4_write_inline_data() if ext4_create_inline_data() has failed. [73131.453234] kernel BUG at fs/ext4/inline.c:223! 212 static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, 213 void *buffer, loff_t pos, unsigned int len) 214 { <...> 223 BUG_ON(!EXT4_I(inode)->i_inline_off); 224 BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); This patch handles the error and prints out a emergency msg saying potential data loss for the given inode (since we couldn't restore the original inline_data due to some previous error). [ 9571.070313] EXT4-fs (dm-0): error restoring inline_data for inode -- potential data loss! (inode 1703982, error -30) Reported-by: Eric Whitney Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/9f4cd7dfd54fa58ff27270881823d94ddf78dd07.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/ext4/inline.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f36936bcd260..abb71ba0b1e6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1126,7 +1126,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { - ext4_create_inline_data(handle, inode, inline_size); + int ret; + + ret = ext4_create_inline_data(handle, inode, inline_size); + if (ret) { + ext4_msg(inode->i_sb, KERN_EMERG, + "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", + inode->i_ino, ret); + return; + } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } -- Gitee From d3178c7c22d632371f3c18710f414100eb5e7f6d Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 12 Apr 2022 15:55:41 +0800 Subject: [PATCH 142/340] integrity: check the return value of audit_log_start() stable inclusion from linux-4.19.230 commit e9fa71bab4de31155eebb3074e29f49d6748fd1a -------------------------------- commit 83230351c523b04ff8a029a4bdf97d881ecb96fc upstream. 
audit_log_start() returns audit_buffer pointer on success or NULL on error, so it is better to check the return value of it. Fixes: 3323eec921ef ("integrity: IMA as an integrity service provider") Signed-off-by: Xiaoke Wang Cc: Reviewed-by: Paul Moore Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- security/integrity/integrity_audit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c index 82c98f7d217e..d03fbdfc972e 100644 --- a/security/integrity/integrity_audit.c +++ b/security/integrity/integrity_audit.c @@ -39,6 +39,8 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, return; ab = audit_log_start(audit_context(), GFP_KERNEL, audit_msgno); + if (!ab) + return; audit_log_format(ab, "pid=%d uid=%u auid=%u ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, current_cred()->uid), -- Gitee From a1bb26418f5b62f17a40d6c1a1af93a88c547450 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Tue, 12 Apr 2022 15:55:42 +0800 Subject: [PATCH 143/340] ima: Remove ima_policy file before directory stable inclusion from linux-4.19.230 commit 2c3beddb39ebe4411497629442f594133f29e0cd -------------------------------- commit f7333b9572d0559e00352a926c92f29f061b4569 upstream. The removal of ima_dir currently fails since ima_policy still exists, so remove the ima_policy file before removing the directory. Fixes: 4af4662fa4a9 ("integrity: IMA policy") Signed-off-by: Stefan Berger Cc: Acked-by: Christian Brauner Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- security/integrity/ima/ima_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 604cdac63d84..38bd565b9da9 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -497,12 +497,12 @@ int __init ima_fs_init(void) return 0; out: + securityfs_remove(ima_policy); securityfs_remove(violations); securityfs_remove(runtime_measurements_count); securityfs_remove(ascii_runtime_measurements); securityfs_remove(binary_runtime_measurements); securityfs_remove(ima_symlink); securityfs_remove(ima_dir); - securityfs_remove(ima_policy); return -1; } -- Gitee From e694730f5007fce98c53d1eb0847d5c65972dc27 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 12 Apr 2022 15:55:43 +0800 Subject: [PATCH 144/340] ima: Allow template selection with ima_template[_fmt]= after ima_hash= stable inclusion from linux-4.19.230 commit 4d9eb5b2ef21c496597f09355d9d6f5508731e98 -------------------------------- commit bb8e52e4906f148c2faf6656b5106cf7233e9301 upstream. Commit c2426d2ad5027 ("ima: added support for new kernel cmdline parameter ima_template_fmt") introduced an additional check on the ima_template variable to avoid multiple template selection. Unfortunately, ima_template could be also set by the setup function of the ima_hash= parameter, when it calls ima_template_desc_current(). This causes attempts to choose a new template with ima_template= or with ima_template_fmt=, after ima_hash=, to be ignored. Achieve the goal of the commit mentioned with the new static variable template_setup_done, so that template selection requests after ima_hash= are not ignored. Finally, call ima_init_template_list(), if not already done, to initialize the list of templates before lookup_template_desc() is called. 
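The underlying pattern, reduced to its essentials with made-up parameter names rather than the IMA ones:

  #include <linux/init.h>

  static char *chosen_name;
  static int   choice_setup_done;

  /* Explicit selection: record it and lock out later selections. */
  static int __init example_choice_setup(char *str)
  {
          if (choice_setup_done)
                  return 1;
          chosen_name = str;
          choice_setup_done = 1;
          return 1;
  }
  __setup("example_choice=", example_choice_setup);

  /* A parameter that only derives a default must not set the flag,
   * otherwise it silently vetoes a later explicit example_choice=.
   */
  static int __init example_fallback_setup(char *str)
  {
          if (!chosen_name)
                  chosen_name = "builtin-default";
          return 1;
  }
  __setup("example_fallback=", example_fallback_setup);

Keying the guard on an explicit 'done' flag instead of on the side effect (the selection already being non-NULL) is what lets ima_template= and ima_template_fmt= still take effect after ima_hash=.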
Reported-by: Guo Zihua Signed-off-by: Roberto Sassu Cc: stable@vger.kernel.org Fixes: c2426d2ad5027 ("ima: added support for new kernel cmdline parameter ima_template_fmt") Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- security/integrity/ima/ima_template.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 30db39b23804..4dfdccce497b 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -31,6 +31,7 @@ static struct ima_template_desc builtin_templates[] = { static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); +static int template_setup_done; static struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, @@ -57,10 +58,11 @@ static int __init ima_template_setup(char *str) struct ima_template_desc *template_desc; int template_len = strlen(str); - if (ima_template) + if (template_setup_done) return 1; - ima_init_template_list(); + if (!ima_template) + ima_init_template_list(); /* * Verify that a template with the supplied name exists. @@ -84,6 +86,7 @@ static int __init ima_template_setup(char *str) } ima_template = template_desc; + template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); @@ -92,7 +95,7 @@ static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); - if (ima_template) + if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { @@ -103,6 +106,7 @@ static int __init ima_template_fmt_setup(char *str) builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; + template_setup_done = 1; return 1; } -- Gitee From f22022eb03eb7bdc3a2dc638a0017e45e5a21dd5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Apr 2022 15:55:44 +0800 Subject: [PATCH 145/340] NFS: Fix initialisation of nfs_client cl_flags field stable inclusion from linux-4.19.230 commit 57e13bdd9634a48c0bdbf988858144350321c0bf -------------------------------- commit 468d126dab45718feeb728319be20bd869a5eaa7 upstream. For some long forgotten reason, the nfs_client cl_flags field is initialised in nfs_get_client() instead of being initialised at allocation time. This quirk was harmless until we moved the call to nfs_create_rpc_client(). 
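In general form, the fix is to copy configuration at allocation time, before any other setup step (or any other thread) can consult the object; a sketch with invented names, not the NFS client code:

  #include <linux/slab.h>

  struct client {
          unsigned long flags;
          /* ... other state ... */
  };

  struct client_initdata {
          unsigned long init_flags;
  };

  static struct client *client_alloc(const struct client_initdata *init)
  {
          struct client *clp = kzalloc(sizeof(*clp), GFP_KERNEL);

          if (!clp)
                  return NULL;

          /* Configuration is visible from the first use of the object;
           * deferring this to a later setup step reopens the window the
           * patch closes.
           */
          clp->flags = init->init_flags;
          return clp;
  }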
Fixes: dd99e9f98fbf ("NFSv4: Initialise connection to the server in nfs4_alloc_client()") Cc: stable@vger.kernel.org # 4.8.x Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b8adf3467786..7d02dc52209d 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -180,6 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); + clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_net = get_net(cl_init->net); @@ -427,7 +428,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, cl_init); } -- Gitee From 9e9b47e9d07a0043f5935a2c8d6e7a062c05445d Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 12 Apr 2022 15:55:45 +0800 Subject: [PATCH 146/340] NFSv4 only print the label when its queried stable inclusion from linux-4.19.230 commit 1789f59f1779c99d0756b40036c62a7519f53543 -------------------------------- [ Upstream commit 2c52c8376db7160a1dd8a681c61c9258405ef143 ] When the bitmask of the attributes doesn't include the security label, don't bother printing it. Since the label might not be null terminated, adjust the printing format accordingly. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/nfs4xdr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index feebe80f2092..982fac53e0c6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4300,10 +4300,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); + if (label && label->label) + dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n", + __func__, label->len, (char *)label->label, + label->len, label->pi, label->lfs); } - if (label && label->label) - dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, - (char *)label->label, label->len, label->pi, label->lfs); return status; out_overflow: -- Gitee From 008ae6aed08efa8a03d777b50e74189f79975f77 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 12 Apr 2022 15:55:46 +0800 Subject: [PATCH 147/340] nfs: nfs4clinet: check the return value of kstrdup() stable inclusion from linux-4.19.230 commit 2c9587f72ff4b502c2fd15eb3ccff388eae12d07 -------------------------------- [ Upstream commit fbd2057e5329d3502a27491190237b6be52a1cb6 ] kstrdup() returns NULL when some internal memory errors happen, it is better to check the return value of it so to catch the memory error in time. 
Signed-off-by: Xiaoke Wang Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/nfs4client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 36282e48cff6..60999b261666 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1276,8 +1276,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, } nfs_put_client(clp); - if (server->nfs_client->cl_hostname == NULL) + if (server->nfs_client->cl_hostname == NULL) { server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (server->nfs_client->cl_hostname == NULL) + return -ENOMEM; + } nfs_server_insert_lists(server); return nfs_probe_destination(server); -- Gitee From 4cb6ed69b775dfec22ea1f071817ca32be43a67f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Apr 2022 15:55:47 +0800 Subject: [PATCH 148/340] NFSv4.1: Fix uninitialised variable in devicenotify stable inclusion from linux-4.19.230 commit e6b0f9177c43ff9ad0f1a6dd3639c383373911b7 -------------------------------- [ Upstream commit b05bf5c63b326ce1da84ef42498d8e0e292e694c ] When decode_devicenotify_args() exits with no entries, we need to ensure that the struct cb_devicenotifyargs is initialised to { 0, NULL } in order to avoid problems in nfs4_callback_devicenotify(). Reported-by: Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/callback.h | 2 +- fs/nfs/callback_proc.c | 2 +- fs/nfs/callback_xdr.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 8f34daf85f70..5d5227ce4d91 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -168,7 +168,7 @@ struct cb_devicenotifyitem { }; struct cb_devicenotifyargs { - int ndevs; + uint32_t ndevs; struct cb_devicenotifyitem *devs; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index bcc51f131a49..868d66ed8bcf 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -364,7 +364,7 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; - int i; + uint32_t i; __be32 res = 0; struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 57558a8d92e9..76aa1b456c52 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -268,11 +268,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, void *argp) { struct cb_devicenotifyargs *args = argp; + uint32_t tmp, n, i; __be32 *p; __be32 status = 0; - u32 tmp; - int n, i; - args->ndevs = 0; /* Num of device notifications */ p = read_buf(xdr, sizeof(uint32_t)); @@ -281,7 +279,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } n = ntohl(*p++); - if (n <= 0) + if (n == 0) goto out; if (n > ULONG_MAX / sizeof(*args->devs)) { status = htonl(NFS4ERR_BADXDR); @@ -339,19 +337,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, dev->cbd_immediate = 0; } - args->ndevs++; - dprintk("%s: type %d layout 0x%x immediate %d\n", __func__, dev->cbd_notify_type, dev->cbd_layout_type, dev->cbd_immediate); } + args->ndevs = n; + dprintk("%s: ndevs %d\n", __func__, args->ndevs); + return 0; +err: + kfree(args->devs); out: + args->devs = NULL; + args->ndevs = 0; dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs); 
return status; -err: - kfree(args->devs); - goto out; } static __be32 decode_sessionid(struct xdr_stream *xdr, -- Gitee From a3877f4a4a470cf50728ee56132d12cdbe68cd18 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 12 Apr 2022 15:55:48 +0800 Subject: [PATCH 149/340] NFSv4 remove zero number of fs_locations entries error check stable inclusion from linux-4.19.230 commit 152f7db416c4bfcc8fc01e55cae60f63489580fa -------------------------------- [ Upstream commit 90e12a3191040bd3854d3e236c35921e4e92a044 ] Remove the check for the zero length fs_locations reply in the xdr decoding, and instead check for that in the migration code. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/nfs4state.c | 3 +++ fs/nfs/nfs4xdr.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5ff516104699..f53f8f17629b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2064,6 +2064,9 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) } result = -NFS4ERR_NXIO; + if (!locations->nlocations) + goto out; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 982fac53e0c6..e8917fc393a4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3753,8 +3753,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(!p)) goto out_overflow; n = be32_to_cpup(p); - if (n <= 0) - goto out_eio; for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; struct nfs4_fs_location *loc; -- Gitee From 4c895d112ed1b4ba14887931cd63118e84aaac44 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 12 Apr 2022 15:55:49 +0800 Subject: [PATCH 150/340] NFSv4 expose nfs_parse_server_name function stable inclusion from linux-4.19.230 commit 42dc3cf3174ec714732825638947e26923d9ea2a -------------------------------- [ Upstream commit f5b27cc6761e27ee6387a24df1a99ca77b360fea ] Make nfs_parse_server_name available outside of nfs4namespace.c. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/nfs4_fs.h | 3 ++- fs/nfs/nfs4namespace.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4f3b5add052b..d22176d87448 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -277,7 +277,8 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); - +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 24f06dcc2b08..936c412be28e 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,8 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct net *net) +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net) { ssize_t ret; -- Gitee From 33d0429868bbffc5401e45102861aaf77df00047 Mon Sep 17 00:00:00 2001 From: ZouMingzhe Date: Tue, 12 Apr 2022 15:55:50 +0800 Subject: [PATCH 151/340] scsi: target: iscsi: Make sure the np under each tpg is unique stable inclusion from linux-4.19.230 commit ded80123b84253ecc6c6cfd1fbbbd99f51c984ec -------------------------------- [ Upstream commit a861790afaa8b6369eee8a88c5d5d73f5799c0c6 ] iscsit_tpg_check_network_portal() has nested for_each loops and is supposed to return true when a match is found. However, the tpg loop will still continue after existing the tpg_np loop. If this tpg_np is not the last the match value will be changed. Break the outer loop after finding a match and make sure the np under each tpg is unique. Link: https://lore.kernel.org/r/20220111054742.19582-1-mingzhe.zou@easystack.cn Signed-off-by: ZouMingzhe Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/target/iscsi/iscsi_target_tpg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/target/iscsi/iscsi_target_tpg.c b/drivers/target/iscsi/iscsi_target_tpg.c index 101d62105c93..f3671ffdf149 100644 --- a/drivers/target/iscsi/iscsi_target_tpg.c +++ b/drivers/target/iscsi/iscsi_target_tpg.c @@ -451,6 +451,9 @@ static bool iscsit_tpg_check_network_portal( break; } spin_unlock(&tpg->tpg_np_lock); + + if (match) + break; } spin_unlock(&tiqn->tiqn_tpg_lock); -- Gitee From 25c90cb2e67c2decff30bf8a4bd9871a6dfa54fc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Apr 2022 15:55:51 +0800 Subject: [PATCH 152/340] bpf: Add kconfig knob for disabling unpriv bpf by default stable inclusion from linux-4.19.230 commit 07e7f7cc619d15645e45d04b1c99550c6d292e9c -------------------------------- commit 08389d888287c3823f80b0216766b71e17f0aba5 upstream. Add a kconfig knob which allows for unprivileged bpf to be disabled by default. If set, the knob sets /proc/sys/kernel/unprivileged_bpf_disabled to value of 2. This still allows a transition of 2 -> {0,1} through an admin. 
Similarly, this also still keeps 1 -> {1} behavior intact, so that once set to permanently disabled, it cannot be undone aside from a reboot. We've also added extra2 with max of 2 for the procfs handler, so that an admin still has a chance to toggle between 0 <-> 2. Either way, as an additional alternative, applications can make use of CAP_BPF that we added a while ago. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/74ec548079189e4e4dffaeb42b8987bb3c852eee.1620765074.git.daniel@iogearbox.net [fllinden@amazon.com: backported to 4.19] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- Documentation/sysctl/kernel.txt | 21 +++++++++++++++++++++ init/Kconfig | 10 ++++++++++ kernel/bpf/syscall.c | 3 ++- kernel/sysctl.c | 29 +++++++++++++++++++++++++---- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 7e33d039ea17..1fa03508ff5a 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -95,6 +95,7 @@ show up in /proc/sys/kernel: - sysctl_writes_strict - tainted - threads-max +- unprivileged_bpf_disabled - unknown_nmi_panic - watchdog - watchdog_thresh @@ -1072,6 +1073,26 @@ available RAM pages threads-max is reduced accordingly. ============================================================== +unprivileged_bpf_disabled: + +Writing 1 to this entry will disable unprivileged calls to bpf(); +once disabled, calling bpf() without CAP_SYS_ADMIN will return +-EPERM. Once set to 1, this can't be cleared from the running kernel +anymore. + +Writing 2 to this entry will also disable unprivileged calls to bpf(), +however, an admin can still change this setting later on, if needed, by +writing 0 or 1 to this entry. + +If BPF_UNPRIV_DEFAULT_OFF is enabled in the kernel config, then this +entry will default to 2 instead of 0. + + 0 - Unprivileged calls to bpf() are enabled + 1 - Unprivileged calls to bpf() are disabled without recovery + 2 - Unprivileged calls to bpf() are disabled + +============================================================== + unknown_nmi_panic: The value in this file affects behavior of handling NMI. When the diff --git a/init/Kconfig b/init/Kconfig index 1a0b15c5a82b..cef0e5fdc901 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1539,6 +1539,16 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter +config BPF_UNPRIV_DEFAULT_OFF + bool "Disable unprivileged BPF by default" + depends on BPF_SYSCALL + help + Disables unprivileged BPF by default by setting the corresponding + /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can + still reenable it by setting it to 0 later on, or permanently + disable it by setting it to 1 (from which no other transition to + 0 is possible anymore). + config USERFAULTFD bool "Enable userfaultfd() system call" select ANON_INODES diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 30253fb4254e..b11852ea1822 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,7 +48,8 @@ static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); -int sysctl_unprivileged_bpf_disabled __read_mostly; +int sysctl_unprivileged_bpf_disabled __read_mostly = + IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 
2 : 0; static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _ops) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5445dead9911..d2eebbcdff52 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -257,6 +257,28 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, #endif +#ifdef CONFIG_BPF_SYSCALL +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + return ret; +} +#endif + static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; @@ -1247,10 +1269,9 @@ static struct ctl_table kern_table[] = { .data = &sysctl_unprivileged_bpf_disabled, .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .proc_handler = bpf_unpriv_handler, + .extra1 = &zero, + .extra2 = &two, }, #endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) -- Gitee From ab97b84dab5098364fef19fa61a016ba89d5be66 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Tue, 12 Apr 2022 15:55:52 +0800 Subject: [PATCH 153/340] bonding: pair enable_port with slave_arr_updates stable inclusion from linux-4.19.230 commit c92b23d934f008aea3ec3239d951f6c4cc0c4422 -------------------------------- [ Upstream commit 23de0d7b6f0e3f9a6283a882594c479949da1120 ] When 803.2ad mode enables a participating port, it should update the slave-array. I have observed that the member links are participating and are part of the active aggregator while the traffic is egressing via only one member link (in a case where two links are participating). Via kprobes I discovered that slave-arr has only one link added while the other participating link wasn't part of the slave-arr. I couldn't see what caused that situation but the simple code-walk through provided me hints that the enable_port wasn't always associated with the slave-array update. 
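One way to read the fix: enabling a participating port and requesting a transmit slave-array rebuild belong together. A hypothetical helper built on the functions visible in the diff below would make the pairing explicit:

  /* Sketch only: struct port and __enable_port() are the bonding
   * driver's own. Rebuilding the slave array whenever a port is
   * enabled keeps the transmit path in sync with the ports that are
   * actually allowed to send.
   */
  static void ad_enable_port_and_refresh(struct port *port,
                                         bool *update_slave_arr)
  {
          __enable_port(port);
          *update_slave_arr = true;
  }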
Fixes: ee6377147409 ("bonding: Simplify the xmit function for modes that use xmit_hash") Signed-off-by: Mahesh Bandewar Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20220207222901.1795287-1-maheshb@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/net/bonding/bond_3ad.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 93dfcef8afc4..035923876c61 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1012,8 +1012,8 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) if (port->aggregator && port->aggregator->is_active && !__port_is_enabled(port)) { - __enable_port(port); + *update_slave_arr = true; } } break; @@ -1760,6 +1760,7 @@ static void ad_agg_selection_logic(struct aggregator *agg, port = port->next_port_in_aggregator) { __enable_port(port); } + *update_slave_arr = true; } } -- Gitee From a7f727331bad16bf5b30724cf22b1ea1a275c31e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:53 +0800 Subject: [PATCH 154/340] ipmr,ip6mr: acquire RTNL before calling ip[6]mr_free_table() on failure path stable inclusion from linux-4.19.230 commit 12b6703e9546902c56b4b9048b893ad49d62bdd4 -------------------------------- [ Upstream commit 5611a00697c8ecc5aad04392bea629e9d6a20463 ] ip[6]mr_free_table() can only be called under RTNL lock. RTNL: assertion failed at net/core/dev.c (10367) WARNING: CPU: 1 PID: 5890 at net/core/dev.c:10367 unregister_netdevice_many+0x1246/0x1850 net/core/dev.c:10367 Modules linked in: CPU: 1 PID: 5890 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller-11627-g422ee58dc0ef #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:unregister_netdevice_many+0x1246/0x1850 net/core/dev.c:10367 Code: 0f 85 9b ee ff ff e8 69 07 4b fa ba 7f 28 00 00 48 c7 c6 00 90 ae 8a 48 c7 c7 40 90 ae 8a c6 05 6d b1 51 06 01 e8 8c 90 d8 01 <0f> 0b e9 70 ee ff ff e8 3e 07 4b fa 4c 89 e7 e8 86 2a 59 fa e9 ee RSP: 0018:ffffc900046ff6e0 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888050f51d00 RSI: ffffffff815fa008 RDI: fffff520008dfece RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815f3d6e R11: 0000000000000000 R12: 00000000fffffff4 R13: dffffc0000000000 R14: ffffc900046ff750 R15: ffff88807b7dc000 FS: 00007f4ab736e700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee0b4f8990 CR3: 000000001e7d2000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mroute_clean_tables+0x244/0xb40 net/ipv6/ip6mr.c:1509 ip6mr_free_table net/ipv6/ip6mr.c:389 [inline] ip6mr_rules_init net/ipv6/ip6mr.c:246 [inline] ip6mr_net_init net/ipv6/ip6mr.c:1306 [inline] ip6mr_net_init+0x3f0/0x4e0 net/ipv6/ip6mr.c:1298 ops_init+0xaf/0x470 net/core/net_namespace.c:140 setup_net+0x54f/0xbb0 net/core/net_namespace.c:331 copy_net_ns+0x318/0x760 net/core/net_namespace.c:475 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 copy_namespaces+0x391/0x450 kernel/nsproxy.c:178 copy_process+0x2e0c/0x7300 kernel/fork.c:2167 kernel_clone+0xe7/0xab0 kernel/fork.c:2555 __do_sys_clone+0xc8/0x110 kernel/fork.c:2672 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 
arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f4ab89f9059 Code: Unable to access opcode bytes at RIP 0x7f4ab89f902f. RSP: 002b:00007f4ab736e118 EFLAGS: 00000206 ORIG_RAX: 0000000000000038 RAX: ffffffffffffffda RBX: 00007f4ab8b0bf60 RCX: 00007f4ab89f9059 RDX: 0000000020000280 RSI: 0000000020000270 RDI: 0000000040200000 RBP: 00007f4ab8a5308d R08: 0000000020000300 R09: 0000000020000300 R10: 00000000200002c0 R11: 0000000000000206 R12: 0000000000000000 R13: 00007ffc3977cc1f R14: 00007f4ab736e300 R15: 0000000000022000 Fixes: f243e5a7859a ("ipmr,ip6mr: call ip6mr_free_table() on failure path") Signed-off-by: Eric Dumazet Cc: Cong Wang Reported-by: syzbot Link: https://lore.kernel.org/r/20220208053451.2885398-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/ipmr.c | 2 ++ net/ipv6/ip6mr.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d235478d9ca3..2085af224a41 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -265,7 +265,9 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; err2: + rtnl_lock(); ipmr_free_table(mrt); + rtnl_unlock(); err1: fib_rules_unregister(ops); return err; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9a8923e6a8fa..d392a72cd784 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -251,7 +251,9 @@ static int __net_init ip6mr_rules_init(struct net *net) return 0; err2: + rtnl_lock(); ip6mr_free_table(mrt); + rtnl_unlock(); err1: fib_rules_unregister(ops); return err; -- Gitee From ceaed295fb486954206baecd6a7ecca0d8c92a05 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 12 Apr 2022 15:55:54 +0800 Subject: [PATCH 155/340] net: do not keep the dst cache when uncloning an skb dst and its metadata stable inclusion from linux-4.19.230 commit 040e92ea3d7d6f27c1b71d6502e35c54a0939cb7 -------------------------------- [ Upstream commit cfc56f85e72f5b9c5c5be26dc2b16518d36a7868 ] When uncloning an skb dst and its associated metadata a new dst+metadata is allocated and the tunnel information from the old metadata is copied over there. The issue is the tunnel metadata has references to cached dst, which are copied along the way. When a dst+metadata refcount drops to 0 the metadata is freed including the cached dst entries. As they are also referenced in the initial dst+metadata, this ends up in UaFs. In practice the above did not happen because of another issue, the dst+metadata was never freed because its refcount never dropped to 0 (this will be fixed in a subsequent patch). Fix this by initializing the dst cache after copying the tunnel information from the old metadata to also unshare the dst cache. Fixes: d71785ffc7e7 ("net: add dst_cache to ovs vxlan lwtunnel") Cc: Paolo Abeni Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Acked-by: Paolo Abeni Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- include/net/dst_metadata.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 14efa0ded75d..b997e0c1e362 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -123,6 +123,19 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); +#ifdef CONFIG_DST_CACHE + /* Unclone the dst cache if there is one */ + if (new_md->u.tun_info.dst_cache.cache) { + int ret; + + ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); + if (ret) { + metadata_dst_free(new_md); + return ERR_PTR(ret); + } + } +#endif + skb_dst_drop(skb); dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); -- Gitee From 0c4296c06307760841438d2672f506071f125553 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 12 Apr 2022 15:55:55 +0800 Subject: [PATCH 156/340] net: fix a memleak when uncloning an skb dst and its metadata stable inclusion from linux-4.19.230 commit 0be943916d781df2b652793bb2d3ae4f9624c10a -------------------------------- [ Upstream commit 9eeabdf17fa0ab75381045c867c370f4cc75a613 ] When uncloning an skb dst and its associated metadata, a new dst+metadata is allocated and later replaces the old one in the skb. This is helpful to have a non-shared dst+metadata attached to a specific skb. The issue is the uncloned dst+metadata is initialized with a refcount of 1, which is increased to 2 before attaching it to the skb. When tun_dst_unclone returns, the dst+metadata is only referenced from a single place (the skb) while its refcount is 2. Its refcount will never drop to 0 (when the skb is consumed), leading to a memory leak. Fix this by removing the call to dst_hold in tun_dst_unclone, as the dst+metadata refcount is already 1. Fixes: fc4099f17240 ("openvswitch: Fix egress tunnel info.") Cc: Pravin B Shelar Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- include/net/dst_metadata.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index b997e0c1e362..adab27ba1ecb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -137,7 +137,6 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) #endif skb_dst_drop(skb); - dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); return new_md; } -- Gitee From c060bda80a7b459b7b92e12bb49bef0d1c7719ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:55:56 +0800 Subject: [PATCH 157/340] veth: fix races around rq->rx_notify_masked stable inclusion from linux-4.19.230 commit 8f0ea3777590c16222d4251a49e31dd376ff5ac1 -------------------------------- [ Upstream commit 68468d8c4cd4222a4ca1f185ab5a1c14480d078c ] veth being NETIF_F_LLTX enabled, we need to be more careful whenever we read/write rq->rx_notify_masked. 
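The fix below only lets the CPU that wins napi_schedule_prep() set the flag and schedule NAPI, and marks the flag accesses with READ_ONCE()/WRITE_ONCE() so they cannot be torn. A rough userspace analogue of that shape (not part of the patch; the pthread harness and C11 atomics merely stand in for NAPI and the ONCE macros, and every name is invented for the example):

------------------------< veth-flush-analogy.c >------------------------
/* Two "xmit" threads race into flush(); the exchange-based gate plays the
 * role of napi_schedule_prep(), so the "NAPI" is scheduled exactly once.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool napi_sched;          /* ~ NAPI_STATE_SCHED */
static atomic_bool rx_notify_masked;    /* ~ rq->rx_notify_masked */
static atomic_int  nr_scheduled;

static bool schedule_prep(void)         /* ~ napi_schedule_prep(): one winner */
{
        return !atomic_exchange(&napi_sched, true);
}

static void flush(void)                 /* ~ the fixed __veth_xdp_flush() */
{
        if (!atomic_load(&rx_notify_masked) && schedule_prep()) {
                atomic_store(&rx_notify_masked, true);
                atomic_fetch_add(&nr_scheduled, 1);     /* ~ __napi_schedule() */
        }
}

static void *xmit_thread(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++)
                flush();
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, xmit_thread, NULL);
        pthread_create(&b, NULL, xmit_thread, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("scheduled %d time(s)\n", atomic_load(&nr_scheduled));
        return 0;
}
------------------------------------------------------------------------

Both threads may observe the flag as clear, but only the one that wins the prep gate sets it and schedules; the KCSAN splat below is against the old code, where the same two writers updated the plain bool with no such arbitration.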
BUG: KCSAN: data-race in veth_xmit / veth_xmit write to 0xffff888133d9a9f8 of 1 bytes by task 23552 on cpu 0: __veth_xdp_flush drivers/net/veth.c:269 [inline] veth_xmit+0x307/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888133d9a9f8 of 1 bytes by task 23563 on cpu 1: __veth_xdp_flush drivers/net/veth.c:268 [inline] veth_xmit+0x2d6/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND 
include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23563 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00064-gc36c04c2e132 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 948d4f214fde ("veth: Add driver XDP") Signed-off-by: Eric Dumazet Cc: Toshiaki Makita Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/net/veth.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 749faa6fcd82..ad098169d23f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -152,9 +152,10 @@ static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); - if (!rq->rx_notify_masked) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (!READ_ONCE(rq->rx_notify_masked) && + napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); } } @@ -623,8 +624,10 @@ static int veth_poll(struct napi_struct *napi, int budget) /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); + } } } -- Gitee From b3195d2342748d56d5110a82ea0b6cced72c3d66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?TATSUKAWA=20KOSUKE=20=28=E7=AB=8B=E5=B7=9D=20=E6=B1=9F?= =?UTF-8?q?=E4=BB=8B=29?= Date: Tue, 12 Apr 2022 15:55:57 +0800 Subject: [PATCH 158/340] n_tty: wake up poll(POLLRDNORM) on receiving data stable inclusion from linux-4.19.230 commit 031ec2d7fabb5c01601490671666c55cf92e1b98 -------------------------------- commit c816b2e65b0e86b95011418cad334f0524fc33b8 upstream. The poll man page says POLLRDNORM is equivalent to POLLIN when used as an event. $ man poll POLLRDNORM Equivalent to POLLIN. However, in n_tty driver, POLLRDNORM does not return until timeout even if there is terminal input, whereas POLLIN returns. The following test program works until kernel-3.17, but the test stops in poll() after commit 57087d515441 ("tty: Fix spurious poll() wakeups"). [Steps to run test program] $ cc -o test-pollrdnorm test-pollrdnorm.c $ ./test-pollrdnorm foo <-- Type in something from the terminal followed by [RET]. The string should be echoed back. 
------------------------< test-pollrdnorm.c >------------------------ #include #include #include #include void main(void) { int n; unsigned char buf[8]; struct pollfd fds[1] = {{ 0, POLLRDNORM, 0 }}; n = poll(fds, 1, -1); if (n < 0) perror("poll"); n = read(0, buf, 8); if (n < 0) perror("read"); if (n > 0) write(1, buf, n); } ------------------------------------------------------------------------ The attached patch fixes this problem. Many calls to wake_up_interruptible_poll() in the kernel source code already specify "POLLIN | POLLRDNORM". Fixes: 57087d515441 ("tty: Fix spurious poll() wakeups") Cc: stable@vger.kernel.org Signed-off-by: Kosuke Tatsukawa Link: https://lore.kernel.org/r/TYCPR01MB81901C0F932203D30E452B3EA5209@TYCPR01MB8190.jpnprd01.prod.outlook.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/n_tty.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 9369f5d2fdf5..a8d7ab7ee057 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1375,7 +1375,7 @@ n_tty_receive_char_special(struct tty_struct *tty, unsigned char c) put_tty_queue(c, ldata); smp_store_release(&ldata->canon_head, ldata->read_head); kill_fasync(&tty->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); + wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); return 0; } } @@ -1656,7 +1656,7 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp, if (read_cnt(ldata)) { kill_fasync(&tty->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); + wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); } } -- Gitee From e5d58f5ee731b2ff4e5c801cb875c804c63593e4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Apr 2022 15:55:58 +0800 Subject: [PATCH 159/340] seccomp: Invalidate seccomp mode to catch death failures stable inclusion from linux-4.19.230 commit 255264d81da6edaf4cd4fab836d1ef3ba09af6aa -------------------------------- commit 495ac3069a6235bfdf516812a2a9b256671bbdf9 upstream. If seccomp tries to kill a process, it should never see that process again. To enforce this proactively, switch the mode to something impossible. If encountered: WARN, reject all syscalls, and attempt to kill the process again even harder. Cc: Andy Lutomirski Cc: Will Drewry Fixes: 8112c4f140fa ("seccomp: remove 2-phase API") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/seccomp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fd023ac24e10..369b63ac7cf9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -28,6 +28,9 @@ #include #include +/* Not exposed in headers: strictly internal use only. 
*/ +#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) + #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include #endif @@ -629,6 +632,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -743,6 +747,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_KILL_PROCESS: default: + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action == SECCOMP_RET_KILL_PROCESS || @@ -793,6 +798,11 @@ int __secure_computing(const struct seccomp_data *sd) return 0; case SECCOMP_MODE_FILTER: return __seccomp_filter(this_syscall, sd, false); + /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ + case SECCOMP_MODE_DEAD: + WARN_ON_ONCE(1); + do_exit(SIGKILL); + return -1; default: BUG(); } -- Gitee From 90dd4e1f09a0734c692d588d6c34045bfd509a35 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 12 Apr 2022 15:55:59 +0800 Subject: [PATCH 160/340] perf: Fix list corruption in perf_cgroup_switch() stable inclusion from linux-4.19.230 commit 30d9f3cbe47e1018ddc8069ac5b5c9e66fbdf727 -------------------------------- commit 5f4e5ce638e6a490b976ade4a40017b40abb2da0 upstream. There's list corruption on cgrp_cpuctx_list. This happens on the following path: perf_cgroup_switch: list_for_each_entry(cgrp_cpuctx_list) cpu_ctx_sched_in ctx_sched_in ctx_pinned_sched_in merge_sched_in perf_cgroup_event_disable: remove the event from the list Use list_for_each_entry_safe() to allow removing an entry during iteration. Fixes: 058fe1c0440e ("perf/core: Make cgroup switch visit only cpuctxs with cgroup events") Signed-off-by: Song Liu Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220204004057.2961252-1-song@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 00b22c820d1f..deba52307349 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -798,7 +798,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); */ static void perf_cgroup_switch(struct task_struct *task, int mode) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -809,7 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_save(flags); list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_lock(cpuctx, cpuctx->task_ctx); -- Gitee From fa03a2a696857c4eb819963f546bc07b2a480962 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Apr 2022 15:56:00 +0800 Subject: [PATCH 161/340] serial: parisc: GSC: fix build when IOSAPIC is not set stable inclusion from linux-4.19.231 commit 592b1b5ad3ee6cce1381111cfd69c6dc868050a3 -------------------------------- commit 6e8793674bb0d1135ca0e5c9f7e16fecbf815926 upstream. 
There is a build error when using a kernel .config file from 'kernel test robot' for a different build problem: hppa64-linux-ld: drivers/tty/serial/8250/8250_gsc.o: in function `.LC3': (.data.rel.ro+0x18): undefined reference to `iosapic_serial_irq' when: CONFIG_GSC=y CONFIG_SERIO_GSCPS2=y CONFIG_SERIAL_8250_GSC=y CONFIG_PCI is not set and hence PCI_LBA is not set. IOSAPIC depends on PCI_LBA, so IOSAPIC is not set/enabled. Make the use of iosapic_serial_irq() conditional to fix the build error. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: linux-parisc@vger.kernel.org Cc: Greg Kroah-Hartman Cc: linux-serial@vger.kernel.org Cc: Jiri Slaby Cc: Johan Hovold Suggested-by: Helge Deller Signed-off-by: Helge Deller Cc: stable@vger.kernel.org Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/serial/8250/8250_gsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_gsc.c b/drivers/tty/serial/8250/8250_gsc.c index 0809ae2aa9b1..51cc985216ff 100644 --- a/drivers/tty/serial/8250/8250_gsc.c +++ b/drivers/tty/serial/8250/8250_gsc.c @@ -26,7 +26,7 @@ static int __init serial_init_chip(struct parisc_device *dev) unsigned long address; int err; -#ifdef CONFIG_64BIT +#if defined(CONFIG_64BIT) && defined(CONFIG_IOSAPIC) if (!dev->irq && (dev->id.sversion == 0xad)) dev->irq = iosapic_serial_irq(dev); #endif -- Gitee From 976db2f4b02c3205c99040cc49586e6d10e74463 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 15:56:01 +0800 Subject: [PATCH 162/340] vfs: make freeze_super abort when sync_filesystem returns error stable inclusion from linux-4.19.231 commit d5c33270b8b2c274eb8df7b92d166166102528c6 -------------------------------- [ Upstream commit 2719c7160dcfaae1f73a1c0c210ad3281c19022e ] If we fail to synchronize the filesystem while preparing to freeze the fs, abort the freeze. Signed-off-by: Darrick J. 
Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/super.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/super.c b/fs/super.c index 9fb4553c46e6..8dc26e23f1a3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1404,11 +1404,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +static void sb_freeze_unlock(struct super_block *sb, int level) { - int level; - - for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } @@ -1479,7 +1477,14 @@ int freeze_super(struct super_block *sb) sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ - sync_filesystem(sb); + ret = sync_filesystem(sb); + if (ret) { + sb->s_writers.frozen = SB_UNFROZEN; + sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; @@ -1491,7 +1496,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1542,7 +1547,7 @@ static int thaw_super_locked(struct super_block *sb) } sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); -- Gitee From 93ed5e1c52141d99b9f8557ec02014be7956eabc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 15:56:02 +0800 Subject: [PATCH 163/340] quota: make dquot_quota_sync return errors from ->sync_fs stable inclusion from linux-4.19.231 commit a1a41571f06e2b66229738bd0c92c1d57dd793e2 -------------------------------- [ Upstream commit dd5532a4994bfda0386eb2286ec00758cee08444 ] Strangely, dquot_quota_sync ignores the return code from the ->sync_fs call, which means that quotacalls like Q_SYNC never see the error. This doesn't seem right, so fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/quota/dquot.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1d1d393f4208..ddb379abd919 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -687,9 +687,14 @@ int dquot_quota_sync(struct super_block *sb, int type) /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... 
*/ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + ret = sync_blockdev(sb->s_bdev); + if (ret) + return ret; /* * Now when everything is written we can discard the pagecache so -- Gitee From 0173888a936ab577dfc98156c76901f91919e3d1 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 12 Apr 2022 15:56:03 +0800 Subject: [PATCH 164/340] nvme: fix a possible use-after-free in controller reset during load stable inclusion from linux-4.19.231 commit a25e460fbb0340488d119fb2e28fe3f829b7417e -------------------------------- [ Upstream commit 0fa0f99fc84e41057cbdd2efbfe91c6b2f47dd9d ] Unlike .queue_rq, in .submit_async_event drivers may not check the ctrl readiness for AER submission. This may lead to a use-after-free condition that was observed with nvme-tcp. The race condition may happen in the following scenario: 1. driver executes its reset_ctrl_work 2. -> nvme_stop_ctrl - flushes ctrl async_event_work 3. ctrl sends AEN which is received by the host, which in turn schedules AEN handling 4. teardown admin queue (which releases the queue socket) 5. AEN processed, submits another AER, calling the driver to submit 6. driver attempts to send the cmd ==> use-after-free In order to fix that, add ctrl state check to validate the ctrl is actually able to accept the AER submission. This addresses the above race in controller resets because the driver during teardown should: 1. change ctrl state to RESETTING 2. flush async_event_work (as well as other async work elements) So after 1,2, any other AER command will find the ctrl state to be RESETTING and bail out without submitting the AER. Signed-off-by: Sagi Grimberg Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/nvme/host/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index feba1a34c15e..164ec45dac84 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3683,7 +3683,14 @@ static void nvme_async_event_work(struct work_struct *work) container_of(work, struct nvme_ctrl, async_event_work); nvme_aen_uevent(ctrl); - ctrl->ops->submit_async_event(ctrl); + + /* + * The transport drivers must guarantee AER submission here is safe by + * flushing ctrl async_event_work after changing the controller state + * from LIVE and before freeing the admin queue. + */ + if (ctrl->state == NVME_CTRL_LIVE) + ctrl->ops->submit_async_event(ctrl); } static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) -- Gitee From 9f4c7283e5e436a211e9e98f87ff0e34f1efb347 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 12 Apr 2022 15:56:04 +0800 Subject: [PATCH 165/340] xfrm: Don't accidentally set RTO_ONLINK in decode_session4() stable inclusion from linux-4.19.231 commit bbf7a1a2fc64d89896ba5eba494a40ca151a2675 -------------------------------- commit 23e7b1bfed61e301853b5e35472820d919498278 upstream. Similar to commit 94e2238969e8 ("xfrm4: strip ECN bits from tos field"), clear the ECN bits from iph->tos when setting ->flowi4_tos. This ensures that the last bit of ->flowi4_tos is cleared, so ip_route_output_key_hash() isn't going to restrict the scope of the route lookup. Use ~INET_ECN_MASK instead of IPTOS_RT_MASK, because we have no reason to clear the high order bits. Found by code inspection, compile tested only. 
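The two low-order bits of the tos byte are the ECN field, and bit 0 is also where ip_route_output_key_hash() looks for RTO_ONLINK, which is how a CE-marked packet could silently narrow the lookup to link scope. A throwaway userspace check of the masks involved (constant values copied from include/net/inet_ecn.h and include/net/route.h purely for illustration; this file is not part of the patch):

------------------------< tos-mask-check.c >------------------------
#include <stdio.h>

#define INET_ECN_MASK   0x03                    /* ECN field, bits 0-1 */
#define IPTOS_TOS_MASK  0x1E
#define IPTOS_RT_MASK   (IPTOS_TOS_MASK & ~3)   /* 0x1c */
#define RTO_ONLINK      0x01                    /* forces link scope */

int main(void)
{
        /* example header: DSCP CS5 (0xa0) with the CE codepoint (0x03) set */
        unsigned int tos = 0xa0 | 0x03;

        printf("raw tos            : 0x%02x  RTO_ONLINK bit set: %u\n",
               tos, tos & RTO_ONLINK);
        printf("tos & ~INET_ECN    : 0x%02x  RTO_ONLINK bit set: %u\n",
               tos & ~INET_ECN_MASK, (tos & ~INET_ECN_MASK) & RTO_ONLINK);
        printf("tos & IPTOS_RT_MASK: 0x%02x  (high DSCP bits dropped as well)\n",
               tos & IPTOS_RT_MASK);
        return 0;
}
------------------------------------------------------------------------

Masking with ~INET_ECN_MASK keeps the full DSCP while guaranteeing the RTO_ONLINK bit is clear; IPTOS_RT_MASK would clear it too, but needlessly throws away the high-order DSCP bits, which is why the patch picks the former.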
Fixes: 4da3089f2b58 ("[IPSEC]: Use TOS when doing tunnel lookups") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski [sudip: manually backport to previous location] Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/xfrm4_policy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 1e5e2e4be0b2..e85b5f57d3e9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,6 +17,7 @@ #include #include #include +#include static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, @@ -126,7 +127,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) fl4->flowi4_proto = iph->protocol; fl4->daddr = reverse ? iph->saddr : iph->daddr; fl4->saddr = reverse ? iph->daddr : iph->saddr; - fl4->flowi4_tos = iph->tos; + fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK; if (!ip_is_fragment(iph)) { switch (iph->protocol) { -- Gitee From b67cb551884fb51136d0bf28dc1716aab90c725b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 12 Apr 2022 15:56:05 +0800 Subject: [PATCH 166/340] taskstats: Cleanup the use of task->exit_code stable inclusion from linux-4.19.231 commit c6055df602b9e9ed9427b7bb9088c75cb5cf6350 -------------------------------- commit 1b5a42d9c85f0e731f01c8d1129001fd8531a8a0 upstream. In the function bacct_add_task the code reading task->exit_code was introduced in commit f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats"), and it is not entirely clear what the taskstats interface is trying to return as only returning the exit_code of the first task in a process doesn't make a lot of sense. As best as I can figure the intent is to return task->exit_code after a task exits. The field is returned with per task fields, so the exit_code of the entire process is not wanted. Only the value of the first task is returned so this is not a useful way to get the per task ptrace stop code. The ordinary case of returning this value is returning after a task exits, which also precludes use for getting a ptrace value. It is common to for the first task of a process to also be the last task of a process so this field may have done something reasonable by accident in testing. Make ac_exitcode a reliable per task value by always returning it for every exited task. Setting ac_exitcode in a sensible mannter makes it possible to continue to provide this value going forward. Cc: Balbir Singh Fixes: f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats") Link: https://lkml.kernel.org/r/20220103213312.9144-5-ebiederm@xmission.com Signed-off-by: "Eric W. 
Biederman" [sudip: adjust context] Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/tsacct.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b45391..8d7c6d3f1daa 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -46,11 +46,10 @@ void bacct_add_tsk(struct user_namespace *user_ns, /* Convert to seconds for btime */ do_div(delta, USEC_PER_SEC); stats->ac_btime = get_seconds() - delta; - if (thread_group_leader(tsk)) { + if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; - if (tsk->flags & PF_FORKNOEXEC) - stats->ac_flag |= AFORK; - } + if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) + stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) -- Gitee From 8f07718445d2d571a85c539a7f04c6b77ee8f5d9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 12 Apr 2022 15:56:06 +0800 Subject: [PATCH 167/340] ping: fix the dif and sdif check in ping_lookup stable inclusion from linux-4.19.231 commit b0e55a57df7deed5f0682b584e4da913db2f6e84 -------------------------------- commit 35a79e64de29e8d57a5989aac57611c0cd29e13e upstream. When 'ping' changes to use PING socket instead of RAW socket by: # sysctl -w net.ipv4.ping_group_range="0 100" There is another regression caused when matching sk_bound_dev_if and dif, RAW socket is using inet_iif() while PING socket lookup is using skb->dev->ifindex, the cmd below fails due to this: # ip link add dummy0 type dummy # ip link set dummy0 up # ip addr add 192.168.111.1/24 dev dummy0 # ping -I dummy0 192.168.111.1 -c1 The issue was also reported on: https://github.com/iputils/iputils/issues/104 But fixed in iputils in a wrong way by not binding to device when destination IP is on device, and it will cause some of kselftests to fail, as Jianlin noticed. This patch is to use inet(6)_iif and inet(6)_sdif to get dif and sdif for PING socket, and keep consistent with RAW socket. Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/ping.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 276442443d32..28d171e6e10b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -177,16 +177,23 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) struct sock *sk = NULL; struct inet_sock *isk; struct hlist_nulls_node *hnode; - int dif = skb->dev->ifindex; + int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { + dif = inet_iif(skb); + sdif = inet_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", (int)ident, &ip_hdr(skb)->daddr, dif); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { + dif = inet6_iif(skb); + sdif = inet6_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif + } else { + pr_err("ping: protocol(%x) is not supported\n", ntohs(skb->protocol)); + return NULL; } read_lock_bh(&ping_table.lock); @@ -226,7 +233,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != inet_sdif(skb)) + sk->sk_bound_dev_if != sdif) continue; sock_hold(sk); -- Gitee From 452fc6b6c1221c7d2d103f6c81a01aaacefc1b1e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:56:07 +0800 Subject: [PATCH 168/340] drop_monitor: fix data-race in dropmon_net_event / trace_napi_poll_hit stable inclusion from linux-4.19.231 commit e294bc65746b779e4ad50f95ed04926cf72d1454 -------------------------------- commit dcd54265c8bc14bd023815e36e2d5f9d66ee1fee upstream. trace_napi_poll_hit() is reading stat->dev while another thread can write on it from dropmon_net_event() Use READ_ONCE()/WRITE_ONCE() here, RCU rules are properly enforced already, we only have to take care of load/store tearing. 
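Concretely, the hunk below has the writer publish NULL with WRITE_ONCE() and the reader take a single READ_ONCE() snapshot of new_stat->dev before comparing it. The same idiom in a self-contained userspace form, with the usual volatile-cast definitions standing in for the kernel macros (simplified; the structures and names here are invented for the example):

------------------------< once-idiom.c >------------------------
#include <stdio.h>

/* classic volatile-cast forms, simplified from the kernel's definitions */
#define READ_ONCE(x)            (*(const volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val)      (*(volatile typeof(x) *)&(x) = (val))

struct net_device { const char *name; };
struct dm_hw_stat_delta { struct net_device *dev; };

static struct net_device eth0 = { "eth0" };
static struct dm_hw_stat_delta hw_stat = { &eth0 };

/* reader: one marked load, then only the local snapshot is used */
static void trace_hit(const struct net_device *napi_dev)
{
        struct net_device *dev = READ_ONCE(hw_stat.dev);

        if (dev == napi_dev)
                printf("hit on %s\n", dev->name);
}

/* writer: unpublish the device with a marked store */
static void net_event(void)
{
        WRITE_ONCE(hw_stat.dev, NULL);
}

int main(void)
{
        trace_hit(&eth0);       /* prints "hit on eth0" */
        net_event();
        trace_hit(&eth0);       /* snapshot is NULL, nothing is dereferenced */
        return 0;
}
------------------------------------------------------------------------

The marked accesses keep the compiler from tearing or re-reading the pointer, and the single snapshot means the comparison and the later use cannot see two different values, which is all this fix needs since RCU already keeps the object itself alive.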
BUG: KCSAN: data-race in dropmon_net_event / trace_napi_poll_hit write to 0xffff88816f3ab9c0 of 8 bytes by task 20260 on cpu 1: dropmon_net_event+0xb8/0x2b0 net/core/drop_monitor.c:1579 notifier_call_chain kernel/notifier.c:84 [inline] raw_notifier_call_chain+0x53/0xb0 kernel/notifier.c:392 call_netdevice_notifiers_info net/core/dev.c:1919 [inline] call_netdevice_notifiers_extack net/core/dev.c:1931 [inline] call_netdevice_notifiers net/core/dev.c:1945 [inline] unregister_netdevice_many+0x867/0xfb0 net/core/dev.c:10415 ip_tunnel_delete_nets+0x24a/0x280 net/ipv4/ip_tunnel.c:1123 vti_exit_batch_net+0x2a/0x30 net/ipv4/ip_vti.c:515 ops_exit_list net/core/net_namespace.c:173 [inline] cleanup_net+0x4dc/0x8d0 net/core/net_namespace.c:597 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 read to 0xffff88816f3ab9c0 of 8 bytes by interrupt on cpu 0: trace_napi_poll_hit+0x89/0x1c0 net/core/drop_monitor.c:292 trace_napi_poll include/trace/events/napi.h:14 [inline] __napi_poll+0x36b/0x3f0 net/core/dev.c:6366 napi_poll net/core/dev.c:6432 [inline] net_rx_action+0x29e/0x650 net/core/dev.c:6519 __do_softirq+0x158/0x2de kernel/softirq.c:558 do_softirq+0xb1/0xf0 kernel/softirq.c:459 __local_bh_enable_ip+0x68/0x70 kernel/softirq.c:383 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:167 [inline] _raw_spin_unlock_bh+0x33/0x40 kernel/locking/spinlock.c:210 spin_unlock_bh include/linux/spinlock.h:394 [inline] ptr_ring_consume_bh include/linux/ptr_ring.h:367 [inline] wg_packet_decrypt_worker+0x73c/0x780 drivers/net/wireguard/receive.c:506 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 value changed: 0xffff88815883e000 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 26435 Comm: kworker/0:1 Not tainted 5.17.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: wg-crypt-wg2 wg_packet_decrypt_worker Fixes: 4ea7e38696c7 ("dropmon: add ability to detect when hardware dropsrxpackets") Signed-off-by: Eric Dumazet Cc: Neil Horman Reported-by: syzbot Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/core/drop_monitor.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 3978a5e8d261..2ed600012640 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -219,13 +219,17 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, rcu_read_lock(); list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + struct net_device *dev; + /* * only add a note to our monitor buffer if: * 1) this is the dev we received on * 2) its after the last_rx delta * 3) our rx_dropped count has gone up */ - if ((new_stat->dev == napi->dev) && + /* Paired with WRITE_ONCE() in dropmon_net_event() */ + dev = READ_ONCE(new_stat->dev); + if ((dev == napi->dev) && (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { trace_drop_common(NULL, NULL); @@ -340,7 +344,10 @@ static int dropmon_net_event(struct notifier_block *ev_block, mutex_lock(&trace_state_mutex); list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { if (new_stat->dev == dev) { - new_stat->dev = NULL; + + /* Paired with READ_ONCE() in trace_napi_poll_hit() */ + WRITE_ONCE(new_stat->dev, NULL); + if (trace_state == TRACE_OFF) { list_del_rcu(&new_stat->list); kfree_rcu(new_stat, rcu); -- Gitee From 68f74ceb415961ef2270751a1e9bee990752a9f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:56:08 +0800 Subject: [PATCH 169/340] bonding: fix data-races around agg_select_timer stable inclusion from linux-4.19.231 commit 4218e6995c19970aa7b32914be6c8e059837cbdf -------------------------------- commit 9ceaf6f76b203682bb6100e14b3d7da4c0bedde8 upstream. syzbot reported that two threads might write over agg_select_timer at the same time. Make agg_select_timer atomic to fix the races. 
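The helper the patch adds, bond_agg_timer_advance(), turns the unlocked "--timer" into a lock-free decrement that can never go below zero and reports expiry exactly once. The same loop transcribed to C11 atomics so it can be compiled and exercised in userspace (the two-thread harness is invented for the example; in the driver the concurrent writer is bond_open() rearming the timer while the state-machine work item decrements it):

------------------------< agg-timer-sketch.c >------------------------
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int agg_select_timer;

/* ~ bond_agg_timer_advance(): true only for the call that reaches zero */
static bool agg_timer_advance(void)
{
        int val, nval;

        for (;;) {
                val = atomic_load(&agg_select_timer);
                if (!val)
                        return false;   /* already expired, nothing to do */
                nval = val - 1;
                if (atomic_compare_exchange_strong(&agg_select_timer,
                                                   &val, nval))
                        return nval == 0;
        }
}

static void *machine(void *arg) /* ~ bond_3ad_state_machine_handler() ticks */
{
        (void)arg;
        for (int i = 0; i < 1000; i++)
                if (agg_timer_advance())
                        printf("aggregator selection runs now\n");
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        atomic_store(&agg_select_timer, 20);    /* ~ initiate_agg_selection() */
        pthread_create(&a, NULL, machine, NULL);
        pthread_create(&b, NULL, machine, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
------------------------------------------------------------------------

However the decrements interleave, the counter never wraps past zero and the "runs now" line is printed exactly once, which the plain u32 field could not guarantee once bond_open() could rewrite it concurrently with the decrement.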
BUG: KCSAN: data-race in bond_3ad_initiate_agg_selection / bond_3ad_state_machine_handler read to 0xffff8881242aea90 of 4 bytes by task 1846 on cpu 1: bond_3ad_state_machine_handler+0x99/0x2810 drivers/net/bonding/bond_3ad.c:2317 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 write to 0xffff8881242aea90 of 4 bytes by task 25910 on cpu 0: bond_3ad_initiate_agg_selection+0x18/0x30 drivers/net/bonding/bond_3ad.c:1998 bond_open+0x658/0x6f0 drivers/net/bonding/bond_main.c:3967 __dev_open+0x274/0x3a0 net/core/dev.c:1407 dev_open+0x54/0x190 net/core/dev.c:1443 bond_enslave+0xcef/0x3000 drivers/net/bonding/bond_main.c:1937 do_set_master net/core/rtnetlink.c:2532 [inline] do_setlink+0x94f/0x2500 net/core/rtnetlink.c:2736 __rtnl_newlink net/core/rtnetlink.c:3414 [inline] rtnl_newlink+0xfeb/0x13e0 net/core/rtnetlink.c:3529 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000050 -> 0x0000004f Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 25910 Comm: syz-executor.1 Tainted: G W 5.17.0-rc4-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jay Vosburgh Cc: Veaceslav Falico Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/net/bonding/bond_3ad.c | 30 +++++++++++++++++++++++++----- include/net/bond_3ad.h | 2 +- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 035923876c61..e3f814e83d9c 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -249,7 +249,7 @@ static inline int __check_agg_selection_timer(struct port *port) if (bond == NULL) return 0; - return BOND_AD_INFO(bond).agg_select_timer ? 1 : 0; + return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** @@ -1965,7 +1965,7 @@ static void ad_marker_response_received(struct bond_marker *marker, */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { - BOND_AD_INFO(bond).agg_select_timer = timeout; + atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** @@ -2249,6 +2249,28 @@ void bond_3ad_update_ad_actor_settings(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_agg_timer_advance - advance agg_select_timer + * @bond: bonding structure + * + * Return true when agg_select_timer reaches 0. 
+ */ +static bool bond_agg_timer_advance(struct bonding *bond) +{ + int val, nval; + + while (1) { + val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); + if (!val) + return false; + nval = val - 1; + if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, + val, nval) == val) + break; + } + return nval == 0; +} + /** * bond_3ad_state_machine_handler - handle state machines timeout * @bond: bonding struct to work on @@ -2284,9 +2306,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) if (!bond_has_slaves(bond)) goto re_arm; - /* check if agg_select_timer timer after initialize is timed out */ - if (BOND_AD_INFO(bond).agg_select_timer && - !(--BOND_AD_INFO(bond).agg_select_timer)) { + if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index fc3111515f5c..732bc3b4606b 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -265,7 +265,7 @@ struct ad_system { struct ad_bond_info { struct ad_system system; /* 802.3ad system structure */ - u32 agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ + atomic_t agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ u16 aggregator_identifier; }; -- Gitee From 386778705e50f8b11ae4d1f8d85ca048420b8fb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Apr 2022 15:56:09 +0800 Subject: [PATCH 170/340] NFS: LOOKUP_DIRECTORY is also ok with symlinks stable inclusion from linux-4.19.231 commit 67552482aee7a139ed957595db8668d28ddc2c42 -------------------------------- commit e0caaf75d443e02e55e146fd75fe2efc8aed5540 upstream. Commit ac795161c936 (NFSv4: Handle case where the lookup of a directory fails) [1], part of Linux since 5.17-rc2, introduced a regression, where a symbolic link on an NFS mount to a directory on another NFS does not resolve(?) the first time it is accessed: Reported-by: Paul Menzel Fixes: ac795161c936 ("NFSv4: Handle case where the lookup of a directory fails") Signed-off-by: Trond Myklebust Tested-by: Donald Buczek Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 72e5ae8cc4ff..2ec602eccb80 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1640,14 +1640,14 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, if (!res) { inode = d_inode(dentry); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) res = ERR_PTR(-ENOTDIR); else if (inode && S_ISREG(inode->i_mode)) res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) { + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { dput(res); res = ERR_PTR(-ENOTDIR); } else if (inode && S_ISREG(inode->i_mode)) { -- Gitee From e488b6784f338947b60eac630caee5f995f67a16 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Apr 2022 15:56:10 +0800 Subject: [PATCH 171/340] NFS: Do not report writeback errors in nfs_getattr() stable inclusion from linux-4.19.231 commit d2ba21f271eb167ac2b0581598e55222b89f0f32 -------------------------------- commit d19e0183a88306acda07f4a01fedeeffe2a2a06b upstream. 
The result of the writeback, whether it is an ENOSPC or an EIO, or anything else, does not inhibit the NFS client from reporting the correct file timestamps. Fixes: 79566ef018f5 ("NFS: Getattr doesn't require data sync semantics") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/nfs/inode.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6d5fedcd30d0..db52430dfbd7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -787,12 +787,9 @@ int nfs_getattr(const struct path *path, struct kstat *stat, goto out_no_update; /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME|STATX_MTIME)) && - S_ISREG(inode->i_mode)) { - err = filemap_write_and_wait(inode->i_mapping); - if (err) - goto out; - } + if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); /* * We may force a getattr if the user cares about atime. -- Gitee From 6ff4ccebf4a5346a100ef89ae44a23300f0d534b Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Tue, 12 Apr 2022 15:56:11 +0800 Subject: [PATCH 172/340] mtd: rawnand: qcom: Fix clock sequencing in qcom_nandc_probe() stable inclusion from linux-4.19.231 commit 891484112f4ffd0b576e88407bbd3653abf3faba -------------------------------- commit 5c23b3f965bc9ee696bf2ed4bdc54d339dd9a455 upstream. Interacting with a NAND chip on an IPQ6018 I found that the qcomsmem NAND partition parser was returning -EPROBE_DEFER waiting for the main smem driver to load. This caused the board to reset. Playing about with the probe() function shows that the problem lies in the core clock being switched off before the nandc_unalloc() routine has completed. If we look at how qcom_nandc_remove() tears down allocated resources we see the expected order is qcom_nandc_unalloc(nandc); clk_disable_unprepare(nandc->aon_clk); clk_disable_unprepare(nandc->core_clk); dma_unmap_resource(&pdev->dev, nandc->base_dma, resource_size(res), DMA_BIDIRECTIONAL, 0); Tweaking probe() to both bring up and tear-down in that order removes the reset if we end up deferring elsewhere. Fixes: c76b78d8ec05 ("mtd: nand: Qualcomm NAND controller driver") Signed-off-by: Bryan O'Donoghue Reviewed-by: Manivannan Sadhasivam Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220103030316.58301-2-bryan.odonoghue@linaro.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/mtd/nand/raw/qcom_nandc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index 07d8750313fd..0e4be6814fc7 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -10,7 +10,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ - #include #include #include @@ -2957,10 +2956,6 @@ static int qcom_nandc_probe(struct platform_device *pdev) if (!nandc->base_dma) return -ENXIO; - ret = qcom_nandc_alloc(nandc); - if (ret) - goto err_nandc_alloc; - ret = clk_prepare_enable(nandc->core_clk); if (ret) goto err_core_clk; @@ -2969,6 +2964,10 @@ static int qcom_nandc_probe(struct platform_device *pdev) if (ret) goto err_aon_clk; + ret = qcom_nandc_alloc(nandc); + if (ret) + goto err_nandc_alloc; + ret = qcom_nandc_setup(nandc); if (ret) goto err_setup; @@ -2980,15 +2979,14 @@ static int qcom_nandc_probe(struct platform_device *pdev) return 0; err_setup: + qcom_nandc_unalloc(nandc); +err_nandc_alloc: clk_disable_unprepare(nandc->aon_clk); err_aon_clk: clk_disable_unprepare(nandc->core_clk); err_core_clk: - qcom_nandc_unalloc(nandc); -err_nandc_alloc: dma_unmap_resource(dev, res->start, resource_size(res), DMA_BIDIRECTIONAL, 0); - return ret; } -- Gitee From b4a48e80b131d001c3981eae7c05c1dbcfec720d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:56:12 +0800 Subject: [PATCH 173/340] net: sched: limit TC_ACT_REPEAT loops stable inclusion from linux-4.19.231 commit f63c9fa36bd7548fd07792127057cccb68a7e274 -------------------------------- commit 5740d068909676d4bdb5c9c00c37a83df7728909 upstream. We have been living dangerously, at the mercy of malicious users, abusing TC_ACT_REPEAT, as shown by this syzpot report [1]. Add an arbitrary limit (32) to the number of times an action can return TC_ACT_REPEAT. v2: switch the limit to 32 instead of 10. Use net_warn_ratelimited() instead of pr_err_once(). [1] (C repro available on demand) rcu: INFO: rcu_preempt self-detected stall on CPU rcu: 1-...!: (10500 ticks this GP) idle=021/1/0x4000000000000000 softirq=5592/5592 fqs=0 (t=10502 jiffies g=5305 q=190) rcu: rcu_preempt kthread timer wakeup didn't happen for 10502 jiffies! g5305 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 rcu: Possible timer handling issue on cpu=0 timer-softirq=3527 rcu: rcu_preempt kthread starved for 10505 jiffies! g5305 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=0 rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior. 
rcu: RCU grace-period kthread stack dump: task:rcu_preempt state:I stack:29344 pid: 14 ppid: 2 flags:0x00004000 Call Trace: context_switch kernel/sched/core.c:4986 [inline] __schedule+0xab2/0x4db0 kernel/sched/core.c:6295 schedule+0xd2/0x260 kernel/sched/core.c:6368 schedule_timeout+0x14a/0x2a0 kernel/time/timer.c:1881 rcu_gp_fqs_loop+0x186/0x810 kernel/rcu/tree.c:1963 rcu_gp_kthread+0x1de/0x320 kernel/rcu/tree.c:2136 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 rcu: Stack dump where RCU GP kthread last ran: Sending NMI from CPU 1 to CPUs 0: NMI backtrace for cpu 0 CPU: 0 PID: 3646 Comm: syz-executor358 Not tainted 5.17.0-rc3-syzkaller-00149-gbf8e59fd315f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:rep_nop arch/x86/include/asm/vdso/processor.h:13 [inline] RIP: 0010:cpu_relax arch/x86/include/asm/vdso/processor.h:18 [inline] RIP: 0010:pv_wait_head_or_lock kernel/locking/qspinlock_paravirt.h:437 [inline] RIP: 0010:__pv_queued_spin_lock_slowpath+0x3b8/0xb40 kernel/locking/qspinlock.c:508 Code: 48 89 eb c6 45 01 01 41 bc 00 80 00 00 48 c1 e9 03 83 e3 07 41 be 01 00 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8d 2c 01 eb 0c 90 41 83 ec 01 0f 84 72 04 00 00 41 0f b6 45 00 38 d8 7f 08 84 RSP: 0018:ffffc9000283f1b0 EFLAGS: 00000206 RAX: 0000000000000003 RBX: 0000000000000000 RCX: 1ffff1100fc0071e RDX: 0000000000000001 RSI: 0000000000000201 RDI: 0000000000000000 RBP: ffff88807e0038f0 R08: 0000000000000001 R09: ffffffff8ffbf9ff R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000004c1e R13: ffffed100fc0071e R14: 0000000000000001 R15: ffff8880b9c3aa80 FS: 00005555562bf300(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffdbfef12b8 CR3: 00000000723c2000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: pv_queued_spin_lock_slowpath arch/x86/include/asm/paravirt.h:591 [inline] queued_spin_lock_slowpath arch/x86/include/asm/qspinlock.h:51 [inline] queued_spin_lock include/asm-generic/qspinlock.h:85 [inline] do_raw_spin_lock+0x200/0x2b0 kernel/locking/spinlock_debug.c:115 spin_lock_bh include/linux/spinlock.h:354 [inline] sch_tree_lock include/net/sch_generic.h:610 [inline] sch_tree_lock include/net/sch_generic.h:605 [inline] prio_tune+0x3b9/0xb50 net/sched/sch_prio.c:211 prio_init+0x5c/0x80 net/sched/sch_prio.c:244 qdisc_create.constprop.0+0x44a/0x10f0 net/sched/sch_api.c:1253 tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660 rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2496 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f7ee98aae99 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 41 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 
002b:00007ffdbfef12d8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007ffdbfef1300 RCX: 00007f7ee98aae99 RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000003 RBP: 0000000000000000 R08: 000000000000000d R09: 000000000000000d R10: 000000000000000d R11: 0000000000000246 R12: 00007ffdbfef12f0 R13: 00000000000f4240 R14: 000000000004ca47 R15: 00007ffdbfef12e4 INFO: NMI handler (nmi_cpu_backtrace_handler) took too long to run: 2.293 msecs NMI backtrace for cpu 1 CPU: 1 PID: 3260 Comm: kworker/1:3 Not tainted 5.17.0-rc3-syzkaller-00149-gbf8e59fd315f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 nmi_cpu_backtrace.cold+0x47/0x144 lib/nmi_backtrace.c:111 nmi_trigger_cpumask_backtrace+0x1b3/0x230 lib/nmi_backtrace.c:62 trigger_single_cpu_backtrace include/linux/nmi.h:164 [inline] rcu_dump_cpu_stacks+0x25e/0x3f0 kernel/rcu/tree_stall.h:343 print_cpu_stall kernel/rcu/tree_stall.h:604 [inline] check_cpu_stall kernel/rcu/tree_stall.h:688 [inline] rcu_pending kernel/rcu/tree.c:3919 [inline] rcu_sched_clock_irq.cold+0x5c/0x759 kernel/rcu/tree.c:2617 update_process_times+0x16d/0x200 kernel/time/timer.c:1785 tick_sched_handle+0x9b/0x180 kernel/time/tick-sched.c:226 tick_sched_timer+0x1b0/0x2d0 kernel/time/tick-sched.c:1428 __run_hrtimer kernel/time/hrtimer.c:1685 [inline] __hrtimer_run_queues+0x1c0/0xe50 kernel/time/hrtimer.c:1749 hrtimer_interrupt+0x31c/0x790 kernel/time/hrtimer.c:1811 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1086 [inline] __sysvec_apic_timer_interrupt+0x146/0x530 arch/x86/kernel/apic/apic.c:1103 sysvec_apic_timer_interrupt+0x8e/0xc0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:638 RIP: 0010:__sanitizer_cov_trace_const_cmp4+0xc/0x70 kernel/kcov.c:286 Code: 00 00 00 48 89 7c 30 e8 48 89 4c 30 f0 4c 89 54 d8 20 48 89 10 5b c3 0f 1f 80 00 00 00 00 41 89 f8 bf 03 00 00 00 4c 8b 14 24 <89> f1 65 48 8b 34 25 00 70 02 00 e8 14 f9 ff ff 84 c0 74 4b 48 8b RSP: 0018:ffffc90002c5eea8 EFLAGS: 00000246 RAX: 0000000000000007 RBX: ffff88801c625800 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: ffff8880137d3100 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff874fcd88 R11: 0000000000000000 R12: ffff88801d692dc0 R13: ffff8880137d3104 R14: 0000000000000000 R15: ffff88801d692de8 tcf_police_act+0x358/0x11d0 net/sched/act_police.c:256 tcf_action_exec net/sched/act_api.c:1049 [inline] tcf_action_exec+0x1a6/0x530 net/sched/act_api.c:1026 tcf_exts_exec include/net/pkt_cls.h:326 [inline] route4_classify+0xef0/0x1400 net/sched/cls_route.c:179 __tcf_classify net/sched/cls_api.c:1549 [inline] tcf_classify+0x3e8/0x9d0 net/sched/cls_api.c:1615 prio_classify net/sched/sch_prio.c:42 [inline] prio_enqueue+0x3a7/0x790 net/sched/sch_prio.c:75 dev_qdisc_enqueue+0x40/0x300 net/core/dev.c:3668 __dev_xmit_skb net/core/dev.c:3756 [inline] __dev_queue_xmit+0x1f61/0x3660 net/core/dev.c:4081 neigh_hh_output include/net/neighbour.h:533 [inline] neigh_output include/net/neighbour.h:547 [inline] ip_finish_output2+0x14dc/0x2170 net/ipv4/ip_output.c:228 __ip_finish_output net/ipv4/ip_output.c:306 [inline] __ip_finish_output+0x396/0x650 net/ipv4/ip_output.c:288 ip_finish_output+0x32/0x200 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0x196/0x310 
net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out+0xaf/0x1a0 net/ipv4/ip_output.c:126 iptunnel_xmit+0x628/0xa50 net/ipv4/ip_tunnel_core.c:82 geneve_xmit_skb drivers/net/geneve.c:966 [inline] geneve_xmit+0x10c8/0x3530 drivers/net/geneve.c:1077 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one net/core/dev.c:3473 [inline] dev_hard_start_xmit+0x1eb/0x920 net/core/dev.c:3489 __dev_queue_xmit+0x2985/0x3660 net/core/dev.c:4116 neigh_hh_output include/net/neighbour.h:533 [inline] neigh_output include/net/neighbour.h:547 [inline] ip6_finish_output2+0xf7a/0x14f0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] __ip6_finish_output+0x61e/0xe90 net/ipv6/ip6_output.c:170 ip6_finish_output+0x32/0x200 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] mld_sendpack+0x9a3/0xe40 net/ipv6/mcast.c:1826 mld_send_cr net/ipv6/mcast.c:2127 [inline] mld_ifc_work+0x71c/0xdc0 net/ipv6/mcast.c:2659 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 ---------------- Code disassembly (best guess): 0: 48 89 eb mov %rbp,%rbx 3: c6 45 01 01 movb $0x1,0x1(%rbp) 7: 41 bc 00 80 00 00 mov $0x8000,%r12d d: 48 c1 e9 03 shr $0x3,%rcx 11: 83 e3 07 and $0x7,%ebx 14: 41 be 01 00 00 00 mov $0x1,%r14d 1a: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax 21: fc ff df 24: 4c 8d 2c 01 lea (%rcx,%rax,1),%r13 28: eb 0c jmp 0x36 * 2a: f3 90 pause <-- trapping instruction 2c: 41 83 ec 01 sub $0x1,%r12d 30: 0f 84 72 04 00 00 je 0x4a8 36: 41 0f b6 45 00 movzbl 0x0(%r13),%eax 3b: 38 d8 cmp %bl,%al 3d: 7f 08 jg 0x47 3f: 84 .byte 0x84 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Acked-by: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Reported-by: syzbot Link: https://lore.kernel.org/r/20220215235305.3272331-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/sched/act_api.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f2c4bfc79663..26851ca5566a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -609,15 +609,24 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; + int repeat_ttl; if (jmp_prgcnt > 0) { jmp_prgcnt -= 1; continue; } + + repeat_ttl = 32; repeat: ret = a->ops->act(skb, a, res); - if (ret == TC_ACT_REPEAT) - goto repeat; /* we need a ttl - JHS */ + + if (unlikely(ret == TC_ACT_REPEAT)) { + if (--repeat_ttl != 0) + goto repeat; + /* suspicious opcode, stop pipeline */ + net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n"); + return TC_ACT_OK; + } if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; -- Gitee From 397e43dac979a6fbb6d358a462a4df516e4406f4 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 12 Apr 2022 15:56:13 +0800 Subject: [PATCH 174/340] dmaengine: sh: rcar-dmac: Check for error num after setting mask stable inclusion from linux-4.19.231 commit 
783d70c94e513ca715643c00c6c275d9eb3b1a9e -------------------------------- commit 2d21543efe332cd8c8f212fb7d365bc8b0690bfa upstream. Because of the possible failure of the dma_supported(), the dma_set_mask_and_coherent() may return error num. Therefore, it should be better to check it and return the error if fails. Fixes: dc312349e875 ("dmaengine: rcar-dmac: Widen DMA mask to 40 bits") Signed-off-by: Jiasheng Jiang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220106030939.2644320-1-jiasheng@iscas.ac.cn Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/dma/sh/rcar-dmac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index 80ff95f75199..29c51762336d 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1817,7 +1817,9 @@ static int rcar_dmac_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dmac); dmac->dev->dma_parms = &dmac->parms; dma_set_max_seg_size(dmac->dev, RCAR_DMATCR_MASK); - dma_set_mask_and_coherent(dmac->dev, DMA_BIT_MASK(40)); + ret = dma_set_mask_and_coherent(dmac->dev, DMA_BIT_MASK(40)); + if (ret) + return ret; ret = rcar_dmac_parse_of(&pdev->dev, dmac); if (ret < 0) -- Gitee From 8dff553c6320897875ff3edb77540ebcbac5aaeb Mon Sep 17 00:00:00 2001 From: JaeSang Yoo Date: Tue, 12 Apr 2022 15:56:14 +0800 Subject: [PATCH 175/340] tracing: Fix tp_printk option related with tp_printk_stop_on_boot stable inclusion from linux-4.19.231 commit 8f8c9e71e192823e4d76fdc53b4391642291bafc -------------------------------- [ Upstream commit 3203ce39ac0b2a57a84382ec184c7d4a0bede175 ] The kernel parameter "tp_printk_stop_on_boot" starts with "tp_printk" which is the same as another kernel parameter "tp_printk". If "tp_printk" setup is called before the "tp_printk_stop_on_boot", it will override the latter and keep it from being set. This is similar to other kernel parameter issues, such as: Commit 745a600cf1a6 ("um: console: Ignore console= option") or init/do_mounts.c:45 (setup function of "ro" kernel param) Fix it by checking for a "_" right after the "tp_printk" and if that exists do not process the parameter. Link: https://lkml.kernel.org/r/20220208195421.969326-1-jsyoo5b@gmail.com Signed-off-by: JaeSang Yoo [ Fixed up change log and added space after if condition ] Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 119dd5fd5840..feed30698280 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -232,6 +232,10 @@ __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { + /* Ignore the "tp_printk_stop_on_boot" param */ + if (*str == '_') + return 0; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; -- Gitee From 6d17be8f94fef455d465b2b17187e0621a457713 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Tue, 12 Apr 2022 15:56:15 +0800 Subject: [PATCH 176/340] cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from linux-4.19.232 commit 4eec5fe1c680a6c47a9bc0cde00960a4eb663342 -------------------------------- commit 05c7b7a92cc87ff8d7fde189d0fade250697573c upstream. 
As previously discussed(https://lkml.org/lkml/2022/1/20/51), cpuset_attach() is affected with similar cpu hotplug race, as follow scenario: cpuset_attach() cpu hotplug --------------------------- ---------------------- down_write(cpuset_rwsem) guarantee_online_cpus() // (load cpus_attach) sched_cpu_deactivate set_cpu_active() // will change cpu_active_mask set_cpus_allowed_ptr(cpus_attach) __set_cpus_allowed_ptr_locked() // (if the intersection of cpus_attach and cpu_active_mask is empty, will return -EINVAL) up_write(cpuset_rwsem) To avoid races such as described above, protect cpuset_attach() call with cpu_hotplug_lock. Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time") Cc: stable@vger.kernel.org # v2.6.32+ Reported-by: Zhao Gongyi Signed-off-by: Zhang Qiao Acked-by: Waiman Long Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- kernel/cgroup/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index feb91177247c..27dd33566df8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1530,6 +1530,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); + cpus_read_lock(); mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1585,6 +1586,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ -- Gitee From 220832c543ad2c75745fc87f046df5b0bc769f8b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Apr 2022 15:56:16 +0800 Subject: [PATCH 177/340] net: __pskb_pull_tail() & pskb_carve_frag_list() drop_monitor friends stable inclusion from linux-4.19.232 commit 1f4ae0f158dafa74133108bfa07b8053eb2a7898 -------------------------------- commit ef527f968ae05c6717c39f49c8709a7e2c19183a upstream. Whenever one of these functions pull all data from an skb in a frag_list, use consume_skb() instead of kfree_skb() to avoid polluting drop monitoring. Fixes: 6fa01ccd8830 ("skbuff: Add pskb_extract() helper function") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20220220154052.1308469-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e1daab49b0eb..c623c129d0ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1977,7 +1977,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { @@ -5482,7 +5482,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb, /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. 
*/ if (clone) { -- Gitee From df2f0cf3b5380b28391094c43c51777c839405ab Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Tue, 12 Apr 2022 15:56:17 +0800 Subject: [PATCH 178/340] gso: do not skip outer ip header in case of ipip and net_failover stable inclusion from linux-4.19.232 commit e9ffbe63f6f32f526a461756309b61c395168d73 -------------------------------- commit cc20cced0598d9a5ff91ae4ab147b3b5e99ee819 upstream. We encounter a tcp drop issue in our cloud environment. Packet GROed in host forwards to a VM virtio_net nic with net_failover enabled. VM acts as a IPVS LB with ipip encapsulation. The full path like: host gro -> vm virtio_net rx -> net_failover rx -> ipvs fullnat -> ipip encap -> net_failover tx -> virtio_net tx When net_failover transmits a ipip pkt (gso_type = 0x0103, which means SKB_GSO_TCPV4, SKB_GSO_DODGY and SKB_GSO_IPXIP4), there is no gso did because it supports TSO and GSO_IPXIP4. But network_header points to inner ip header. Call Trace: tcp4_gso_segment ------> return NULL inet_gso_segment ------> inner iph, network_header points to ipip_gso_segment inet_gso_segment ------> outer iph skb_mac_gso_segment Afterwards virtio_net transmits the pkt, only inner ip header is modified. And the outer one just keeps unchanged. The pkt will be dropped in remote host. Call Trace: inet_gso_segment ------> inner iph, outer iph is skipped skb_mac_gso_segment __skb_gso_segment validate_xmit_skb validate_xmit_skb_list sch_direct_xmit __qdisc_run __dev_queue_xmit ------> virtio_net dev_hard_start_xmit __dev_queue_xmit ------> net_failover ip_finish_output2 ip_output iptunnel_xmit ip_tunnel_xmit ipip_tunnel_xmit ------> ipip dev_hard_start_xmit __dev_queue_xmit ip_finish_output2 ip_output ip_forward ip_rcv __netif_receive_skb_one_core netif_receive_skb_internal napi_gro_receive receive_buf virtnet_poll net_rx_action The root cause of this issue is specific with the rare combination of SKB_GSO_DODGY and a tunnel device that adds an SKB_GSO_ tunnel option. SKB_GSO_DODGY is set from external virtio_net. We need to reset network header when callbacks.gso_segment() returns NULL. This patch also includes ipv6_gso_segment(), considering SIT, etc. Fixes: cb32f511a70b ("ipip: add GSO/TSO support") Signed-off-by: Tao Liu Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- net/ipv4/af_inet.c | 5 ++++- net/ipv6/ip6_offload.c | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 84ce4f86bc19..86268f3f9bb9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1338,8 +1338,11 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, } ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_segment)) + if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; + } if (IS_ERR_OR_NULL(segs)) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c7e495f12011..6c47cd0ef240 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -98,6 +98,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) -- Gitee From 99c367d73932d7f5e19100a4515d8729e0514b8c Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Tue, 12 Apr 2022 15:56:18 +0800 Subject: [PATCH 179/340] tty: n_gsm: fix proper link termination after failed open stable inclusion from linux-4.19.232 commit 337e49675ce55c23a50b92aae889ec5d910d6dc7 -------------------------------- commit e3b7468f082d106459e86e8dc6fb9bdd65553433 upstream. Trying to open a DLCI by sending a SABM frame may fail with a timeout. The link is closed on the initiator side without informing the responder about this event. The responder assumes the link is open after sending a UA frame to answer the SABM frame. The link gets stuck in a half open state. This patch fixes this by initiating the proper link termination procedure after link setup timeout instead of silently closing it down. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-3-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/n_gsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 7de54c78a7b9..3e9c8a5b6445 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -1486,7 +1486,7 @@ static void gsm_dlci_t1(struct timer_list *t) dlci->mode = DLCI_MODE_ADM; gsm_dlci_open(dlci); } else { - gsm_dlci_close(dlci); + gsm_dlci_begin_close(dlci); /* prevent half open link */ } break; -- Gitee From 7287cbc17e84a4cf9c0488f353041c6880bad61d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 12 Apr 2022 15:56:19 +0800 Subject: [PATCH 180/340] memblock: use kfree() to release kmalloced memblock regions stable inclusion from linux-4.19.232 commit a6fcced73d15ab57cd97813999edf6f19d3032f8 -------------------------------- commit c94afc46cae7ad41b2ad6a99368147879f4b0e56 upstream. memblock.{reserved,memory}.regions may be allocated using kmalloc() in memblock_double_array(). Use kfree() to release these kmalloced regions indicated by memblock_{reserved,memory}_in_slab. 
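For illustration only, a minimal user-space sketch of the rule this patch restores -- release storage with the allocator that actually provided it. The names below are invented and stand in for the memblock API:

  #include <stdlib.h>
  #include <stdio.h>

  #define STATIC_SLOTS 4

  struct region { unsigned long base, size; };

  /* "boot-time" storage, standing in for memblock_*_init_regions */
  static struct region init_regions[STATIC_SLOTS];

  struct region_table {
          struct region *regions;
          int in_slab;            /* set once storage came from malloc() */
  };

  static void table_grow(struct region_table *t, int nr)
  {
          /* switch to heap storage, as memblock_double_array() may do */
          t->regions = malloc(nr * sizeof(*t->regions));
          t->in_slab = 1;
  }

  static void table_discard(struct region_table *t)
  {
          /* pick the release path that matches the allocation path */
          if (t->in_slab)
                  free(t->regions);
          /* else: static storage, nothing to free here */
          t->regions = NULL;
          t->in_slab = 0;
  }

  int main(void)
  {
          struct region_table t = { .regions = init_regions, .in_slab = 0 };

          table_discard(&t);      /* static case: free() must not run */
          table_grow(&t, 16);
          table_discard(&t);      /* heap case: free() must run */
          puts("ok");
          return 0;
  }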
Signed-off-by: Miaohe Lin Fixes: 3010f876500f ("mm: discard memblock data later") Signed-off-by: Mike Rapoport Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- mm/memblock.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 80c6975ace6f..daa2c42a02d9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -330,14 +330,20 @@ void __init memblock_discard(void) addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); - __memblock_free_late(addr, size); + if (memblock_reserved_in_slab) + kfree(memblock.reserved.regions); + else + __memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); - __memblock_free_late(addr, size); + if (memblock_memory_in_slab) + kfree(memblock.memory.regions); + else + __memblock_free_late(addr, size); } } #endif -- Gitee From 8e22786899efe78757423845bbf081c03a4bd314 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Apr 2022 15:56:20 +0800 Subject: [PATCH 181/340] fget: clarify and improve __fget_files() implementation stable inclusion from linux-4.19.232 commit 400c2f361c25bc092d0636cfa32d0549a181e653 -------------------------------- commit e386dfc56f837da66d00a078e5314bc8382fab83 upstream. Commit 054aa8d439b9 ("fget: check that the fd still exists after getting a ref to it") fixed a race with getting a reference to a file just as it was being closed. It was a fairly minimal patch, and I didn't think re-checking the file pointer lookup would be a measurable overhead, since it was all right there and cached. But I was wrong, as pointed out by the kernel test robot. The 'poll2' case of the will-it-scale.per_thread_ops benchmark regressed quite noticeably. Admittedly it seems to be a very artificial test: doing "poll()" system calls on regular files in a very tight loop in multiple threads. That means that basically all the time is spent just looking up file descriptors without ever doing anything useful with them (not that doing 'poll()' on a regular file is useful to begin with). And as a result it shows the extra "re-check fd" cost as a sore thumb. Happily, the regression is fixable by just writing the code to loook up the fd to be better and clearer. There's still a cost to verify the file pointer, but now it's basically in the noise even for that benchmark that does nothing else - and the code is more understandable and has better comments too. 
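The shape of that loop can be sketched outside the kernel with C11 atomics. This is only an analogy (no RCU, invented names), but it shows the lookup / take-ref-unless-zero / revalidate / retry structure the new helper uses:

  #include <stdatomic.h>
  #include <stdio.h>

  struct obj {
          atomic_int refs;        /* 0 means the object is being torn down */
          int payload;
  };

  static _Atomic(struct obj *) slot;      /* stands in for the fd table entry */

  /* Bump the refcount only if it has not already dropped to zero. */
  static int get_ref_unless_zero(struct obj *o)
  {
          int old = atomic_load(&o->refs);

          while (old != 0)
                  if (atomic_compare_exchange_weak(&o->refs, &old, old + 1))
                          return 1;
          return 0;
  }

  static struct obj *lookup(void)
  {
          for (;;) {
                  struct obj *o = atomic_load(&slot);

                  if (!o)
                          return NULL;
                  /* (a) the ref already went to zero: retry the lookup */
                  if (!get_ref_unless_zero(o))
                          continue;
                  /* (b) the slot changed under us: put the ref, retry */
                  if (atomic_load(&slot) != o) {
                          atomic_fetch_sub(&o->refs, 1);
                          continue;
                  }
                  return o;       /* ref held and the entry still matches */
          }
  }

  int main(void)
  {
          struct obj o = { .refs = 1, .payload = 42 };

          atomic_store(&slot, &o);
          struct obj *got = lookup();
          printf("%d\n", got ? got->payload : -1);
          return 0;
  }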
[ Side note: this patch is also a classic case of one that looks very messy with the default greedy Myers diff - it's much more legible with either the patience of histogram diff algorithm ] Link: https://lore.kernel.org/lkml/20211210053743.GA36420@xsang-OptiPlex-9020/ Link: https://lore.kernel.org/lkml/20211213083154.GA20853@linux.intel.com/ Reported-by: kernel test robot Tested-by: Carel Si Cc: Jann Horn Cc: Miklos Szeredi Signed-off-by: Linus Torvalds Signed-off-by: Baokun Li Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- fs/file.c | 73 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/fs/file.c b/fs/file.c index 20a44d0ac400..eb10b7634d76 100644 --- a/fs/file.c +++ b/fs/file.c @@ -762,28 +762,69 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +static inline struct file *__fget_files_rcu(struct files_struct *files, + unsigned int fd, fmode_t mask, unsigned int refs) { - struct files_struct *files = current->files; - struct file *file; + for (;;) { + struct file *file; + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + struct file __rcu **fdentry; - rcu_read_lock(); -loop: - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken. - * dup2() atomicity guarantee is the reason - * we loop to catch the new file (or NULL pointer) + if (unlikely(fd >= fdt->max_fds)) + return NULL; + + fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + file = rcu_dereference_raw(*fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(file->f_mode & mask)) + return NULL; + + /* + * Ok, we have a file pointer. However, because we do + * this all locklessly under RCU, we may be racing with + * that file being closed. + * + * Such a race can take two forms: + * + * (a) the file ref already went down to zero, + * and get_file_rcu_many() fails. Just try + * again: + */ + if (unlikely(!get_file_rcu_many(file, refs))) + continue; + + /* + * (b) the file table entry has changed under us. + * Note that we don't need to re-check the 'fdt->fd' + * pointer having changed, because it always goes + * hand-in-hand with 'fdt'. + * + * If so, we need to put our refs and try again. */ - if (file->f_mode & mask) - file = NULL; - else if (!get_file_rcu_many(file, refs)) - goto loop; - else if (__fcheck_files(files, fd) != file) { + if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || + unlikely(rcu_dereference_raw(*fdentry) != file)) { fput_many(file, refs); - goto loop; + continue; } + + /* + * Ok, we have a ref to the file, and checked that it + * still exists. + */ + return file; } +} + + +static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +{ + struct files_struct *files = current->files; + struct file *file; + + rcu_read_lock(); + file = __fget_files_rcu(files, fd, mask, refs); rcu_read_unlock(); return file; -- Gitee From be439c06daf3bff8bc50c6317be8410fc49e400f Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Tue, 12 Apr 2022 15:56:21 +0800 Subject: [PATCH 182/340] tty: n_gsm: fix encoding of control signal octet bit DV stable inclusion from linux-4.19.232 commit 28ca082153794cf5c98e7bb93d7f30f8ba46bec4 -------------------------------- commit 737b0ef3be6b319d6c1fd64193d1603311969326 upstream. n_gsm is based on the 3GPP 07.010 and its newer version is the 3GPP 27.010. 
See https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=1516 The changes from 07.010 to 27.010 are non-functional. Therefore, I refer to the newer 27.010 here. Chapter 5.4.6.3.7 describes the encoding of the control signal octet used by the MSC (modem status command). The same encoding is also used in convergence layer type 2 as described in chapter 5.5.2. Table 7 and 24 both require the DV (data valid) bit to be set 1 for outgoing control signal octets sent by the DTE (data terminal equipment), i.e. for the initiator side. Currently, the DV bit is only set if CD (carrier detect) is on, regardless of the side. This patch fixes this behavior by setting the DV bit on the initiator side unconditionally. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-1-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- drivers/tty/n_gsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 3e9c8a5b6445..3f08ad7e629a 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -428,7 +428,7 @@ static u8 gsm_encode_modem(const struct gsm_dlci *dlci) modembits |= MDM_RTR; if (dlci->modem_tx & TIOCM_RI) modembits |= MDM_IC; - if (dlci->modem_tx & TIOCM_CD) + if (dlci->modem_tx & TIOCM_CD || dlci->gsm->initiator) modembits |= MDM_DV; return modembits; } -- Gitee From dc8da50c3488f9d6ae233b3afb0334c1aa09c7a5 Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Tue, 12 Apr 2022 15:56:22 +0800 Subject: [PATCH 183/340] Revert "module, async: async_synchronize_full() on module init iff async is used" stable inclusion from linux-4.19.231 commit a0c66ac8b72f816d5631fde0ca0b39af602dce48 -------------------------------- [ Upstream commit 67d6212afda218d564890d1674bab28e8612170f ] This reverts commit 774a1221e862b343388347bac9b318767336b20b. We need to finish all async code before the module init sequence is done. In the reverted commit the PF_USED_ASYNC flag was added to mark a thread that called async_schedule(). Then the PF_USED_ASYNC flag was used to determine whether or not async_synchronize_full() needs to be invoked. This works when modprobe thread is calling async_schedule(), but it does not work if module dispatches init code to a worker thread which then calls async_schedule(). For example, PCI driver probing is invoked from a worker thread based on a node where device is attached: if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); We end up in a situation where a worker thread gets the PF_USED_ASYNC flag set instead of the modprobe thread. As a result, async_synchronize_full() is not invoked and modprobe completes without waiting for the async code to finish. The issue was discovered while loading the pm80xx driver: (scsi_mod.scan=async) modprobe pm80xx worker ... do_init_module() ... pci_call_probe() work_on_cpu(local_pci_probe) local_pci_probe() pm8001_pci_probe() scsi_scan_host() async_schedule() worker->flags |= PF_USED_ASYNC; ... < return from worker > ... if (current->flags & PF_USED_ASYNC) <--- false async_synchronize_full(); Commit 21c3c5d28007 ("block: don't request module during elevator init") fixed the deadlock issue which the reverted commit 774a1221e862 ("module, async: async_synchronize_full() on module init iff async is used") tried to fix. 
Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous request_module() from async workers") synchronous module loading from async is not allowed. Given that the original deadlock issue is fixed and it is no longer allowed to call synchronous request_module() from async we can remove PF_USED_ASYNC flag to make module init consistently invoke async_synchronize_full() unless async module probe is requested. Signed-off-by: Igor Pylypiv Reviewed-by: Changyuan Lyu Reviewed-by: Luis Chamberlain Acked-by: Tejun Heo Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Signed-off-by: Laibin Qiu --- include/linux/sched.h | 1 - kernel/async.c | 3 --- kernel/module.c | 25 +++++-------------------- 3 files changed, 5 insertions(+), 24 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0202fe007318..ec7418ff12d4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1432,7 +1432,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/kernel/async.c b/kernel/async.c index a893d6170944..4bf1b00a28d8 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -191,9 +191,6 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 38c583d54def..591c176ac6a2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3493,12 +3493,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3524,22 +3518,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). 
* - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. */ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + -- Gitee From 2ea2de935b1cbae65e60c53e9dd82c130f06e447 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 14 Apr 2022 09:33:35 +0000 Subject: [PATCH 184/340] ovl: sync dirty data when remounting to ro mode mainline inclusion from mainline-v5.8-rc1 commit 399c109d357a7e217cf7ef551e7e234439c68c15 category: bugfix bugzilla: 186580, https://gitee.com/openeuler/kernel/issues/I52LVN CVE: NA -------------------------------- sync_filesystem() does not sync dirty data for readonly filesystem during umount, so before changing to readonly filesystem we should sync dirty data for data integrity. Signed-off-by: Chengguang Xu Signed-off-by: Miklos Szeredi Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/overlayfs/super.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 6729999f4a66..45fe0c71acf6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -377,11 +377,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ofs = sb->s_fs_info; + struct super_block *upper_sb; + int ret = 0; if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; - return 0; + if (*flags & SB_RDONLY && !sb_rdonly(sb)) { + upper_sb = ofs->upper_mnt->mnt_sb; + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + } + + return ret; } static const struct super_operations ovl_super_operations = { -- Gitee From b52ef3c66e2e1bcea44269e8738a8c9deedf6851 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 Apr 2022 09:33:36 +0000 Subject: [PATCH 185/340] ovl: fix lseek overflow on 32bit mainline inclusion from mainline-v5.6-rc1 commit a4ac9d45c0cd14a2adc872186431c79804b77dbf category: bugfix bugzilla: 95402, https://gitee.com/openeuler/kernel/issues/I52LW9 CVE: NA -------------------------------- ovl_lseek() is using ssize_t to return the value from vfs_llseek(). On a 32-bit kernel ssize_t is a 32-bit signed int, which overflows above 2 GB. Assign the return value of vfs_llseek() to loff_t to fix this. 
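A small illustration of the truncation, with int32_t playing the role of the 32-bit ssize_t (values above 2 GiB typically wrap negative on common ABIs):

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          int64_t offset = 3LL * 1024 * 1024 * 1024;    /* 3 GiB seek result */
          int32_t truncated = (int32_t)offset;          /* what a 32-bit ssize_t keeps */
          int64_t kept = offset;                        /* what loff_t keeps */

          printf("truncated: %d\n", truncated);         /* wraps negative */
          printf("kept     : %lld\n", (long long)kept); /* 3221225472 */
          return 0;
  }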
Reported-by: Boris Gjenero Fixes: 9e46b840c705 ("ovl: support stacked SEEK_HOLE/SEEK_DATA") Cc: # v4.19 Signed-off-by: Miklos Szeredi Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/overlayfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index cd1c94f77dee..470ea215bebc 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -159,7 +159,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); struct fd real; const struct cred *old_cred; - ssize_t ret; + loff_t ret; /* * The two special cases below do not need to involve real fs, -- Gitee From b453e4a7c462a1182a2427029475e39ecade22d3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 Apr 2022 09:33:37 +0000 Subject: [PATCH 186/340] ovl: fix IOCB_DIRECT if underlying fs doesn't support direct IO mainline inclusion from mainline-v5.15-rc5 commit 1dc1eed46f9fa4cb8a07baa24fb44c96d6dd35c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I51NT0 CVE: NA -------------------------------- Normally the check at open time suffices, but e.g loop device does set IOCB_DIRECT after doing its own checks (which are not sufficent for overlayfs). Make sure we don't call the underlying filesystem read/write method with the IOCB_DIRECT if it's not supported. Reported-by: Huang Jianan Fixes: 16914e6fc7e1 ("ovl: add ovl_read_iter()") Cc: # v4.19 Tested-by: Huang Jianan Signed-off-by: Miklos Szeredi conflicts: fs/overlayfs/file.c Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/overlayfs/file.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 470ea215bebc..63b7b2600d81 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -252,13 +252,19 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, ovl_iocb_to_rwf(iocb)); revert_creds(old_cred); ovl_file_accessed(file); - +out_fdput: fdput(real); return ret; @@ -286,6 +292,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) goto out_unlock; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + old_cred = ovl_override_creds(file_inode(file)->i_sb); file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, @@ -295,7 +307,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) /* Update size */ ovl_copyattr(ovl_inode_real(inode), inode); - +out_fdput: fdput(real); out_unlock: -- Gitee From c1662fc5abe162b5d73e11cc524af8fa26e42680 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 Apr 2022 09:33:38 +0000 Subject: [PATCH 187/340] ovl: fix uninitialized pointer read in ovl_lookup_real_one() mainline inclusion from mainline-v5.14-rc6 commit 580c610429b3994e8db24418927747cf28443cde category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I51PNB CVE: NA -------------------------------- One error path can result in release_dentry_name_snapshot() being called before "name" was initialized by 
take_dentry_name_snapshot(). Fix by moving the release_dentry_name_snapshot() to immediately after the only use. Reported-by: Colin Ian King Signed-off-by: Miklos Szeredi conflicts: fs/overlayfs/export.c Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/overlayfs/export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 4028e04a7b0b..bf6c97ce6d8b 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -398,6 +398,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); this = lookup_one_len(name.name, connected, strlen(name.name)); + release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { goto fail; @@ -412,7 +413,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, } out: - release_dentry_name_snapshot(&name); dput(parent); inode_unlock(dir); return this; -- Gitee From 1b1798702f1dabcf5dc978cfc86f43906f2879f6 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 14 Apr 2022 09:33:39 +0000 Subject: [PATCH 188/340] hamradio: defer 6pack kfree after unregister_netdev mainline inclusion from mainline-v5.16-rc1 commit 0b9111922b1f399aba6ed1e1b8f2079c3da1aed8 category: bugfix bugzilla: 186546, https://gitee.com/openeuler/kernel/issues/I51J84 CVE: CVE-2022-1198 -------------------------------- There is a possible race condition (use-after-free) like below (USE) | (FREE) dev_queue_xmit | __dev_queue_xmit | __dev_xmit_skb | sch_direct_xmit | ... xmit_one | netdev_start_xmit | tty_ldisc_kill __netdev_start_xmit | 6pack_close sp_xmit | kfree sp_encaps | | According to the patch "defer ax25 kfree after unregister_netdev", this patch reorder the kfree after the unregister_netdev to avoid the possible UAF as the unregister_netdev() is well synchronized and won't return if there is a running routine. Signed-off-by: Lin Ma Signed-off-by: David S. Miller Signed-off-by: Xu Jia Reviewed-by: Wei Yongjun Reviewed-by: Wang Weiyang Signed-off-by: Yongqiang Liu --- drivers/net/hamradio/6pack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 1001e9a2edd4..91180761b8cb 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -677,11 +677,13 @@ static void sixpack_close(struct tty_struct *tty) del_timer_sync(&sp->tx_t); del_timer_sync(&sp->resync_t); - /* Free all 6pack frame buffers. */ + unregister_netdev(sp->dev); + + /* Free all 6pack frame buffers after unreg. */ kfree(sp->rbuff); kfree(sp->xbuff); - unregister_netdev(sp->dev); + free_netdev(sp->dev); } /* Perform I/O control on an active 6pack channel. */ -- Gitee From 2170508665c2533a62505f129b91e0c90397884a Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 14 Apr 2022 09:33:40 +0000 Subject: [PATCH 189/340] hamradio: remove needs_free_netdev to avoid UAF mainline inclusion from mainline-v5.16-rc2 commit 81b1d548d00bcd028303c4f3150fa753b9b8aa71 category: bugfix bugzilla: 186546, https://gitee.com/openeuler/kernel/issues/I51J84 CVE: CVE-2022-1198 -------------------------------- The former patch "defer 6pack kfree after unregister_netdev" reorders the kfree of two buffer after the unregister_netdev to prevent the race condition. It also adds free_netdev() function in sixpack_close(), which is a direct copy from the similar code in mkiss_close(). 
However, in sixpack driver, the flag needs_free_netdev is set to true in sp_setup(), hence the unregister_netdev() will free the netdev automatically. Therefore, as the sp is netdev_priv, use-after-free occurs. This patch removes the needs_free_netdev = true and just let the free_netdev to finish this deallocation task. Fixes: 0b9111922b1f ("hamradio: defer 6pack kfree after unregister_netdev") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/20211111141402.7551-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Xu Jia Reviewed-by: Wei Yongjun Reviewed-by: Wang Weiyang Signed-off-by: Yongqiang Liu --- drivers/net/hamradio/6pack.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 91180761b8cb..592c1d92a8e0 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -311,7 +311,6 @@ static void sp_setup(struct net_device *dev) { /* Finish setting up the DEVICE info. */ dev->netdev_ops = &sp_netdev_ops; - dev->needs_free_netdev = true; dev->mtu = SIXP_MTU; dev->hard_header_len = AX25_MAX_HEADER_LEN; dev->header_ops = &ax25_header_ops; -- Gitee From 12f56ab1a40bd5caa1d83ecaf9d32593f51fbd82 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Thu, 14 Apr 2022 09:33:41 +0000 Subject: [PATCH 190/340] drivers: hamradio: 6pack: fix UAF bug caused by mod_timer() mainline inclusion from mainline-v5.18-rc1 commit efe4186e6a1b54bf38b9e05450d43b0da1fd7739 category: bugfix bugzilla: 186546, https://gitee.com/openeuler/kernel/issues/I51J84 CVE: CVE-2022-1198 -------------------------------- When a 6pack device is detaching, the sixpack_close() will act to cleanup necessary resources. Although del_timer_sync() in sixpack_close() won't return if there is an active timer, one could use mod_timer() in sp_xmit_on_air() to wake up timer again by calling userspace syscall such as ax25_sendmsg(), ax25_connect() and ax25_ioctl(). This unexpected waked handler, sp_xmit_on_air(), realizes nothing about the undergoing cleanup and may still call pty_write() to use driver layer resources that have already been released. One of the possible race conditions is shown below: (USE) | (FREE) ax25_sendmsg() | ax25_queue_xmit() | ... | sp_xmit() | sp_encaps() | sixpack_close() sp_xmit_on_air() | del_timer_sync(&sp->tx_t) mod_timer(&sp->tx_t,...) | ... | unregister_netdev() | ... (wait a while) | tty_release() | tty_release_struct() | release_tty() sp_xmit_on_air() | tty_kref_put(tty_struct) //FREE pty_write(tty_struct) //USE | ... The corresponding fail log is shown below: Reviewed-by: Wei Yongjun Reviewed-by: Wang Weiyang =============================================================== BUG: KASAN: use-after-free in __run_timers.part.0+0x170/0x470 Write of size 8 at addr ffff88800a652ab8 by task swapper/2/0 ... Call Trace: ... queue_work_on+0x3f/0x50 pty_write+0xcd/0xe0pty_write+0xcd/0xe0 sp_xmit_on_air+0xb2/0x1f0 call_timer_fn+0x28/0x150 __run_timers.part.0+0x3c2/0x470 run_timer_softirq+0x3b/0x80 __do_softirq+0xf1/0x380 ... This patch reorders the del_timer_sync() after the unregister_netdev() to avoid UAF bugs. Because the unregister_netdev() is well synchronized, it flushs out any pending queues, waits the refcount of net_device decreases to zero and removes net_device from kernel. There is not any running routines after executing unregister_netdev(). Therefore, we could not arouse timer from userspace again. Signed-off-by: Duoming Zhou Reviewed-by: Lin Ma Signed-off-by: David S. 
Miller Signed-off-by: Xu Jia Signed-off-by: Yongqiang Liu --- drivers/net/hamradio/6pack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 592c1d92a8e0..6f7e1598106f 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -673,11 +673,11 @@ static void sixpack_close(struct tty_struct *tty) */ netif_stop_queue(sp->dev); + unregister_netdev(sp->dev); + del_timer_sync(&sp->tx_t); del_timer_sync(&sp->resync_t); - unregister_netdev(sp->dev); - /* Free all 6pack frame buffers after unreg. */ kfree(sp->rbuff); kfree(sp->xbuff); -- Gitee From 3ef161fb8febd46bf96b669b98e2efc590873105 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Apr 2022 09:33:42 +0000 Subject: [PATCH 191/340] mm: page_counter: mitigate consequences of a page_counter underflow mainline inclusion from mainline-v5.13-rc1 commit 9317d0fffeb4c3929069cfc7377cfa2a7cd36d1d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5241W CVE: NA -------------------------------- When the unsigned page_counter underflows, even just by a few pages, a cgroup will not be able to run anything afterwards and trigger the OOM killer in a loop. Underflows shouldn't happen, but when they do in practice, we may just be off by a small amount that doesn't interfere with the normal operation - consequences don't need to be that dire. Reset the page_counter to 0 upon underflow. We'll issue a warning that the accounting will be off and then try to keep limping along. [ We used to do this with the original res_counter, where it was a more straight-forward correction inside the spinlock section. I didn't carry it forward into the lockless page counters for simplicity, but it turns out this is quite useful in practice. ] Link: https://lkml.kernel.org/r/20210408143155.2679744-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Chris Down Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/page_counter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/page_counter.c b/mm/page_counter.c index 147ff99187b8..58903e3bfdd3 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -57,9 +57,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_protected_usage(counter, new); /* More uncharges than charges? */ - WARN_ON_ONCE(new < 0); + if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", + new, nr_pages)) { + new = 0; + atomic_long_set(&counter->usage, new); + } + propagate_protected_usage(counter, new); } /** -- Gitee From 456a929c284b5221a1106074e545f88aee8aeca4 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Thu, 14 Apr 2022 09:33:43 +0000 Subject: [PATCH 192/340] mm: Drop reliable_reserve_size hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Commit 368d710d7f32 ("mm: Fallback to non-mirrored region below low watermark") already set the default value of reliable_reserve_size to zero which will disable reliable watermark check by default. With this patch, code related to this mechanism is removed since no one use this watermark check. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- include/linux/mem_reliable.h | 17 ----------------- mm/mem_reliable.c | 33 --------------------------------- mm/page_alloc.c | 4 ---- 3 files changed, 54 deletions(-) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 8340a24fe76d..6d57c36fb676 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -24,7 +24,6 @@ DECLARE_PER_CPU(long, nr_reliable_buddy_pages); extern struct percpu_counter pagecache_reliable_pages; extern struct percpu_counter anon_reliable_pages; -extern unsigned long nr_reliable_reserve_pages __read_mostly; extern long shmem_reliable_nr_page __read_mostly; extern void add_reliable_mem_size(long sz); @@ -118,21 +117,6 @@ static inline void mem_reliable_buddy_counter(struct page *page, int nr_page) this_cpu_add(nr_reliable_buddy_pages, nr_page); } -/* reserve mirrored memory for kernel usage */ -static inline bool mem_reliable_watermark_ok(int nr_page) -{ - long sum = 0; - int cpu; - - if (!reliable_allow_fb_enabled()) - return true; - - for_each_possible_cpu(cpu) - sum += per_cpu(nr_reliable_buddy_pages, cpu); - - return sum > nr_reliable_reserve_pages; -} - static inline bool mem_reliable_shmem_limit_check(void) { return percpu_counter_read_positive(&reliable_shmem_used_nr_page) < @@ -178,7 +162,6 @@ static inline void shmem_reliable_page_counter(struct page *page, int nr_page) static inline bool pagecache_reliable_is_enabled(void) { return false; } static inline bool mem_reliable_status(void) { return false; } static inline void mem_reliable_buddy_counter(struct page *page, int nr_page) {} -static inline bool mem_reliable_watermark_ok(int nr_page) { return true; } static inline bool mem_reliable_shmem_limit_check(void) { return true; } static inline void reliable_lru_add(enum lru_list lru, struct page *page, diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 6d4ab4bee3d5..83bfdf265273 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -11,8 +11,6 @@ #include #include -#define MEM_RELIABLE_RESERVE_MIN 0 - enum mem_reliable_types { MEM_RELIABLE_ALL, MEM_RELIABLE_FALLBACK, @@ -31,7 +29,6 @@ bool reliable_allow_fallback __read_mostly = true; bool shmem_reliable __read_mostly = true; struct percpu_counter reliable_shmem_used_nr_page __read_mostly; DEFINE_PER_CPU(long, nr_reliable_buddy_pages); -unsigned long nr_reliable_reserve_pages = MEM_RELIABLE_RESERVE_MIN / PAGE_SIZE; long shmem_reliable_nr_page = LONG_MAX; bool pagecache_use_reliable_mem __read_mostly = true; @@ -338,29 +335,6 @@ int reliable_debug_handler(struct ctl_table *table, int write, return ret; } -static unsigned long sysctl_reliable_reserve_size = MEM_RELIABLE_RESERVE_MIN; - -int reliable_reserve_size_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - unsigned long *data_ptr = (unsigned long *)(table->data); - unsigned long old = *data_ptr; - int ret; - - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); - if (ret == 0 && write) { - if (*data_ptr > total_reliable_mem_sz() || - *data_ptr < MEM_RELIABLE_RESERVE_MIN) { - *data_ptr = old; - return -EINVAL; - } - - nr_reliable_reserve_pages = *data_ptr / PAGE_SIZE; - } - - return ret; -} - #ifdef CONFIG_SHMEM static unsigned long sysctl_shmem_reliable_bytes_limit = ULONG_MAX; @@ -417,13 +391,6 @@ static struct ctl_table reliable_ctl_table[] = { .mode = 0600, .proc_handler = reliable_debug_handler, }, - { - .procname = "reliable_reserve_size", - .data = 
&sysctl_reliable_reserve_size, - .maxlen = sizeof(sysctl_reliable_reserve_size), - .mode = 0644, - .proc_handler = reliable_reserve_size_handler, - }, #ifdef CONFIG_SHMEM { .procname = "shmem_reliable_bytes_limit", diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d4f75235420..38f2a84d3224 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4649,10 +4649,6 @@ static inline bool check_after_alloc(gfp_t *gfp_mask, unsigned int order, if (*gfp_mask & __GFP_NOFAIL) goto out; - /* check water mark, reserver mirrored mem for kernel */ - if (!mem_reliable_watermark_ok(1 << order)) - goto out_free_page; - /* percpu counter is not initialized, ignore limit check */ if (!mem_reliable_counter_initialized()) goto out; -- Gitee From 9f07eb30e62486a47d70adc4568af3265febe568 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Thu, 14 Apr 2022 09:33:44 +0000 Subject: [PATCH 193/340] mm: Add space after ReliableFileCache hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Add a space after ReliableFileCache so that the code is easy to read and maintain. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/mem_reliable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 83bfdf265273..646c27993df4 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -230,7 +230,8 @@ void reliable_report_meminfo(struct seq_file *m) num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); show_val_kb(m, "FileCache: ", num); - show_val_kb(m, "ReliableFileCache:", nr_pagecache_pages); + seq_printf(m, "ReliableFileCache: %8llu kB\n", + nr_pagecache_pages); } } -- Gitee From 92a256aca17c823413f3fdb529ac57359d3f780b Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Thu, 14 Apr 2022 09:33:45 +0000 Subject: [PATCH 194/340] can: ems_usb: ems_usb_start_xmit(): fix double dev_kfree_skb() in error path mainline inclusion from mainline-v5.18-rc1 commit c70222752228a62135cee3409dccefd494a24646 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I51YBP CVE: CVE-2022-28390 -------------------------------- There is no need to call dev_kfree_skb() when usb_submit_urb() fails beacause can_put_echo_skb() deletes the original skb and can_free_echo_skb() deletes the cloned skb. 
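A toy model of the ownership rule behind the fix -- once a buffer has been handed to a helper that disposes of it on failure, the caller's error path must not free it again. The names are invented; this is not the CAN driver API:

  #include <stdlib.h>
  #include <stdio.h>

  struct buf { char data[64]; };

  /* Helper that takes ownership: it frees the buffer itself when it fails. */
  static int submit(struct buf *b, int force_error)
  {
          if (force_error) {
                  free(b);        /* helper consumes the buffer on failure */
                  return -1;
          }
          /* ... would queue b for transmission and free it on completion ... */
          free(b);
          return 0;
  }

  int main(void)
  {
          struct buf *b = malloc(sizeof(*b));

          if (submit(b, 1) < 0) {
                  /* The removed bug pattern: calling free(b) here as well
                   * would be a double free, since submit() already did. */
                  fprintf(stderr, "submit failed\n");
          }
          return 0;
  }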
Link: https://lore.kernel.org/all/20220228083639.38183-1-hbh25y@gmail.com Fixes: 702171adeed3 ("ems_usb: Added support for EMS CPC-USB/ARM7 CAN/USB interface") Cc: stable@vger.kernel.org Cc: Sebastian Haas Signed-off-by: Hangyu Hua Signed-off-by: Marc Kleine-Budde Signed-off-by: Baisong Zhong Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- drivers/net/can/usb/ems_usb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index b7dfd4109d24..7a92f640c379 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -823,7 +823,6 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne usb_unanchor_urb(urb); usb_free_coherent(dev->udev, size, buf, urb->transfer_dma); - dev_kfree_skb(skb); atomic_dec(&dev->active_tx_urbs); -- Gitee From c82b8d71f1d43c7a387c8de3a1432bd6a88c24b1 Mon Sep 17 00:00:00 2001 From: Lijun Fang Date: Thu, 14 Apr 2022 09:33:46 +0000 Subject: [PATCH 195/340] svm: Export symbols for svm module ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4UR5Y CVE: NA -------------------- Export symbols for svm module. Signed-off-by: Lijun Fang Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- arch/arm64/mm/context.c | 2 ++ drivers/iommu/iommu.c | 1 + mm/share_pool.c | 1 + 3 files changed, 4 insertions(+) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 35a103c7c22b..e7cc878a0b05 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -310,6 +310,7 @@ unsigned long mm_context_get(struct mm_struct *mm) return asid; } +EXPORT_SYMBOL_GPL(mm_context_get); void mm_context_put(struct mm_struct *mm) { @@ -325,6 +326,7 @@ void mm_context_put(struct mm_struct *mm) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); } +EXPORT_SYMBOL_GPL(mm_context_put); /* Errata workaround post TTBRx_EL1 update. */ asmlinkage void post_ttbr_update_workaround(void) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 63bd5940b290..84a59762b037 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2287,6 +2287,7 @@ int iommu_request_dm_for_dev(struct device *dev) return ret; } +EXPORT_SYMBOL_GPL(iommu_request_dm_for_dev); const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) { diff --git a/mm/share_pool.c b/mm/share_pool.c index 90733b807f12..40147ef85ba5 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -91,6 +91,7 @@ static int __read_mostly enable_share_k2u_spg = 1; int sysctl_ac_mode = AC_NONE; /* debug mode */ int sysctl_sp_debug_mode; +EXPORT_SYMBOL(sysctl_sp_debug_mode); int sysctl_share_pool_map_lock_enable; -- Gitee From fd64ea2e7f74c11a088b3ce6701e704b8ff728a5 Mon Sep 17 00:00:00 2001 From: Lijun Fang Date: Thu, 14 Apr 2022 09:33:47 +0000 Subject: [PATCH 196/340] svm: Delete get meminfo interface in svm ioctl ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4UR5Y CVE: NA -------------------- The functions(svm_get_hugeinfo, svm_get_phy_memory_info) can be replaced by reading /proc/meminfo, we will never use these functions. 
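For example, the hugepage counters those ioctls exposed are available to userspace roughly like this (illustration only, minimal error handling):

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          char line[256];
          FILE *f = fopen("/proc/meminfo", "r");

          if (!f)
                  return 1;
          while (fgets(line, sizeof(line), f)) {
                  /* print the fields the deleted ioctls used to report */
                  if (!strncmp(line, "HugePages_Total:", 16) ||
                      !strncmp(line, "HugePages_Free:", 15) ||
                      !strncmp(line, "MemFree:", 8))
                          fputs(line, stdout);
          }
          fclose(f);
          return 0;
  }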
Signed-off-by: Lijun Fang Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- drivers/char/svm.c | 118 --------------------------------------------- 1 file changed, 118 deletions(-) diff --git a/drivers/char/svm.c b/drivers/char/svm.c index 098b84529d0e..c48cbce39661 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -43,8 +43,6 @@ #define SVM_IOCTL_LOAD_FLAG 0xfffa #define SVM_IOCTL_PIN_MEMORY 0xfff7 #define SVM_IOCTL_UNPIN_MEMORY 0xfff5 -#define SVM_IOCTL_GETHUGEINFO 0xfff6 -#define SVM_IOCTL_GET_PHYMEMINFO 0xfff8 #define SVM_IOCTL_REMAP_PROC 0xfff4 #define SVM_REMAP_MEM_LEN_MAX (16 * 1024 * 1024) @@ -123,23 +121,6 @@ struct svm_proc_mem { u64 buf; }; -struct meminfo { - unsigned long hugetlbfree; - unsigned long hugetlbtotal; -}; - -struct phymeminfo { - unsigned long normal_total; - unsigned long normal_free; - unsigned long huge_total; - unsigned long huge_free; -}; - -struct phymeminfo_ioctl { - struct phymeminfo *info; - unsigned long nodemask; -}; - struct spalloc { unsigned long addr; unsigned long size; @@ -169,10 +150,6 @@ static char *svm_cmd_to_string(unsigned int cmd) return "pin memory"; case SVM_IOCTL_UNPIN_MEMORY: return "unpin memory"; - case SVM_IOCTL_GETHUGEINFO: - return "get hugeinfo"; - case SVM_IOCTL_GET_PHYMEMINFO: - return "get physical memory info"; case SVM_IOCTL_REMAP_PROC: return "remap proc"; case SVM_IOCTL_LOAD_FLAG: @@ -1562,95 +1539,6 @@ static int svm_set_rc(unsigned long __user *arg) return 0; } -static long svm_get_hugeinfo(unsigned long __user *arg) -{ - struct hstate *h = &default_hstate; - struct meminfo info; - - if (!acpi_disabled) - return -EPERM; - - if (arg == NULL) - return -EINVAL; - - if (!hugepages_supported()) - return -ENOTSUPP; - - info.hugetlbfree = h->free_huge_pages; - info.hugetlbtotal = h->nr_huge_pages; - - if (copy_to_user((void __user *)arg, &info, sizeof(info))) - return -EFAULT; - - pr_info("svm get hugetlb info: order(%u), max_huge_pages(%lu)," - "nr_huge_pages(%lu), free_huge_pages(%lu), resv_huge_pages(%lu)", - h->order, - h->max_huge_pages, - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages); - - return 0; -} - -static void svm_get_node_memory_info_inc(unsigned long nid, struct phymeminfo *info) -{ - struct sysinfo i; - struct hstate *h = &default_hstate; - unsigned long huge_free = 0; - unsigned long huge_total = 0; - - if (hugepages_supported()) { - huge_free = h->free_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); - huge_total = h->nr_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); - } - -#ifdef CONFIG_NUMA - si_meminfo_node(&i, nid); -#else - si_meminfo(&i); -#endif - info->normal_free += i.freeram * PAGE_SIZE; - info->normal_total += i.totalram * PAGE_SIZE - huge_total; - info->huge_total += huge_total; - info->huge_free += huge_free; -} - -static void __svm_get_memory_info(unsigned long nodemask, struct phymeminfo *info) -{ - memset(info, 0x0, sizeof(struct phymeminfo)); - - nodemask = nodemask & ((1UL << MAX_NUMNODES) - 1); - - while (nodemask) { - unsigned long nid = find_first_bit(&nodemask, BITS_PER_LONG); - if (node_isset(nid, node_online_map)) { - (void)svm_get_node_memory_info_inc(nid, info); - } - - nodemask &= ~(1UL << nid); - } -} - -static long svm_get_phy_memory_info(unsigned long __user *arg) -{ - struct phymeminfo info; - struct phymeminfo_ioctl para; - - if (arg == NULL) - return -EINVAL; - - if (copy_from_user(¶, (void __user *)arg, sizeof(para))) - return -EFAULT; - - __svm_get_memory_info(para.nodemask, &info); - - if (copy_to_user((void __user 
*)para.info, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - static long svm_remap_get_phys(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long *phys, unsigned long *page_size, unsigned long *offset) @@ -2092,12 +1980,6 @@ static long svm_ioctl(struct file *file, unsigned int cmd, case SVM_IOCTL_UNPIN_MEMORY: err = svm_unpin_memory((unsigned long __user *)arg); break; - case SVM_IOCTL_GETHUGEINFO: - err = svm_get_hugeinfo((unsigned long __user *)arg); - break; - case SVM_IOCTL_GET_PHYMEMINFO: - err = svm_get_phy_memory_info((unsigned long __user *)arg); - break; case SVM_IOCTL_REMAP_PROC: err = svm_remap_proc((unsigned long __user *)arg); break; -- Gitee From 6fe391e5c3b97d2e60fcaad102ab96852a5e033b Mon Sep 17 00:00:00 2001 From: Lijun Fang Date: Thu, 14 Apr 2022 09:33:48 +0000 Subject: [PATCH 197/340] svm: Delete unused function sysrq_sched_debug_show_export ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4UR5Y CVE: NA -------------------- Delete unused function sysrq_sched_debug_show_export Signed-off-by: Lijun Fang Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- drivers/char/svm.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/char/svm.c b/drivers/char/svm.c index c48cbce39661..864cb79c6738 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -163,8 +163,6 @@ static char *svm_cmd_to_string(unsigned int cmd) return NULL; } -extern void sysrq_sched_debug_tidy(void); - /* * image word of slot * SVM_IMAGE_WORD_INIT: initial value, indicating that the slot is not used. @@ -423,17 +421,6 @@ static int svm_va2pa_trunk_init(struct device *dev) return 0; } -void sysrq_sched_debug_show_export(void) -{ -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_tidy(); -#else - pr_err("Not open CONFIG_SCHED_DEBUG\n"); -#endif - panic("pcie heart miss\n"); -} -EXPORT_SYMBOL(sysrq_sched_debug_show_export); - static struct svm_process *find_svm_process(unsigned long asid) { struct rb_node *node = svm_process_root.rb_node; -- Gitee From 492bf18ea8aac80805383a4b898ac7e999526d9d Mon Sep 17 00:00:00 2001 From: Lijun Fang Date: Thu, 14 Apr 2022 09:33:49 +0000 Subject: [PATCH 198/340] ascend: mm: Add MAP_ALIGN flag to map aligned va ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4UR5Y CVE: NA -------------------- svm module use svm_get_unmapped_area ops to map an aligned va, which used by mapping l2buf memory. The svm_get_unmapped_area use a lot of duplicated codes, we add MAP_ALIGN to adjust mapinfo alignmask. 
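A worked example of the align_mask arithmetic used in the mmap change below, assuming 4 KiB pages and a power-of-two mapping length (the intended l2buf case): with this mask, vm_unmapped_area() returns a start address whose low bits are clear, i.e. a start that is a multiple of len.

  #include <stdio.h>

  #define PAGE_SHIFT 12

  int main(void)
  {
          unsigned long len = 16UL << 20;               /* 16 MiB mapping */
          unsigned long mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT;
          unsigned long addr = 0x7f4000000000UL;        /* candidate start */

          printf("align_mask = %#lx\n", mask);          /* 0xfff000 */
          printf("aligned    = %d\n", (addr & mask) == 0);
          printf("addr %% len = %lu\n", addr % len);    /* 0: start is len-aligned */
          return 0;
  }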
Signed-off-by: Lijun Fang Reviewed-by: Kefeng Wang Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/powerpc/include/uapi/asm/mman.h | 1 + arch/sparc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 2 ++ include/uapi/asm-generic/mman.h | 2 ++ mm/mmap.c | 8 ++++++++ 8 files changed, 17 insertions(+) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 1c7ce2716ad3..977df8306905 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -35,6 +35,7 @@ #define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 4570a54ac1d9..1a6ffb5515ed 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -53,6 +53,7 @@ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ /* * Flags for msync diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 06857eb1bee8..8d88c457fe10 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -29,6 +29,7 @@ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index 24354f792b00..802cc3fe1358 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -31,6 +31,7 @@ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ /* Override any generic PKEY permission defines */ #define PKEY_DISABLE_EXECUTE 0x4 diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h index 214abe17f44a..953528235c2e 100644 --- a/arch/sparc/include/uapi/asm/mman.h +++ b/arch/sparc/include/uapi/asm/mman.h @@ -28,6 +28,7 @@ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ #endif /* _UAPI__SPARC_MMAN_H__ */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 2c9d70560238..97239cce3ffc 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -67,6 +67,8 @@ # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ 
#endif +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ + /* * Flags for msync */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index defdf92911c3..028b4e846185 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -21,6 +21,8 @@ #define MAP_REPLACE 0x1000000 #endif +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ + /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ #define MCL_CURRENT 1 /* lock all current mappings */ diff --git a/mm/mmap.c b/mm/mmap.c index e529384b8ac6..ce3fba7c31cb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2382,6 +2382,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, sp_area_work_around(&info, flags); + /* Align vmstart to len */ + if (filp && (flags & MAP_ALIGN)) + info.align_mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT; + return vm_unmapped_area(&info); } #endif @@ -2437,6 +2441,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, sp_area_work_around(&info, flags); + /* Align vmstart to len */ + if (filp && (flags & MAP_ALIGN)) + info.align_mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT; + addr = vm_unmapped_area(&info); /* -- Gitee From 0e5d94f9e3f905f9faf3ca12280df300271c80a8 Mon Sep 17 00:00:00 2001 From: Lijun Fang Date: Thu, 14 Apr 2022 09:33:50 +0000 Subject: [PATCH 199/340] svm: Delete unused svm_get_unmapped_area ops ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4UR5Y CVE: NA -------------------- svm_get_unmapped_area will be no longer used. Signed-off-by: Lijun Fang Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- drivers/char/svm.c | 60 ---------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/drivers/char/svm.c b/drivers/char/svm.c index 864cb79c6738..1f24328b3504 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -1685,65 +1685,6 @@ static int svm_proc_load_flag(int __user *arg) return put_user(flag, arg); } -static unsigned long svm_get_unmapped_area(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - unsigned long addr = addr0; - struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; - struct svm_device *sdev = file_to_sdev(file); - - if (!acpi_disabled) - return -EPERM; - - if (flags & MAP_FIXED) { - if (IS_ALIGNED(addr, len)) - return addr; - - dev_err(sdev->dev, "MAP_FIXED but not aligned\n"); - return -EINVAL; //lint !e570 - } - - if (addr) { - struct vm_area_struct *vma = NULL; - - addr = ALIGN(addr, len); - - if (dvpp_mmap_check(addr, len, flags)) - return -ENOMEM; - - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (vma == NULL || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = ((mm->mmap_base <= DVPP_MMAP_BASE) ? 
- mm->mmap_base : DVPP_MMAP_BASE); - info.align_mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT; - info.align_offset = pgoff << PAGE_SHIFT; - - addr = vm_unmapped_area(&info); - - if (offset_in_page(addr)) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = DVPP_MMAP_BASE; - - if (enable_mmap_dvpp) - dvpp_mmap_get_area(&info, flags); - - addr = vm_unmapped_area(&info); - } - - return addr; -} - static int svm_mmap(struct file *file, struct vm_area_struct *vma) { int err; @@ -1997,7 +1938,6 @@ static const struct file_operations svm_fops = { .owner = THIS_MODULE, .open = svm_open, .mmap = svm_mmap, - .get_unmapped_area = svm_get_unmapped_area, .unlocked_ioctl = svm_ioctl, }; -- Gitee From 8f2269877eef6b5a36c8109280ee5fd10dcaff51 Mon Sep 17 00:00:00 2001 From: Lijun Fang Date: Thu, 14 Apr 2022 09:33:51 +0000 Subject: [PATCH 200/340] svm: Change svm to modules ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4UR5Y CVE: NA -------------------- Change svm to modules by default. Signed-off-by: Lijun Fang Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- drivers/char/Kconfig | 4 ++-- drivers/char/svm.c | 36 +++++++++++++++++------------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 505562acf275..d6bb0a4bde56 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -553,9 +553,9 @@ config ADI driver include crash and makedumpfile. config HISI_SVM - bool "Hisilicon svm driver" + tristate "Hisilicon svm driver" depends on ARM64 && ARM_SMMU_V3 && MMU_NOTIFIER && HUGETLBFS - default y + default m help This driver provides character-level access to Hisilicon SVM chipset. Typically, you can bind a task to the diff --git a/drivers/char/svm.c b/drivers/char/svm.c index 1f24328b3504..9f98f57d562a 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -500,7 +500,7 @@ static int svm_remove_core(struct device *dev, void *data) { struct core_device *cdev = to_core_device(dev); - if (!cdev->smmu_bypass) { + if (!cdev->smmu_bypass && cdev->group && cdev->domain) { iommu_sva_device_shutdown(dev); iommu_detach_group(cdev->domain, cdev->group); iommu_group_put(cdev->group); @@ -928,11 +928,7 @@ static struct task_struct *svm_get_task(struct svm_bind_process params) if (params.flags & SVM_BIND_PID) { struct mm_struct *mm = NULL; - rcu_read_lock(); - task = find_task_by_vpid(params.vpid); - if (task) - get_task_struct(task); - rcu_read_unlock(); + task = find_get_task_by_vpid(params.vpid); if (task == NULL) return ERR_PTR(-ESRCH); @@ -1039,6 +1035,7 @@ static int svm_acpi_add_core(struct svm_device *sdev, struct core_device *cdev = NULL; char *name = NULL; enum dev_dma_attr attr; + const union acpi_object *obj; name = devm_kasprintf(sdev->dev, GFP_KERNEL, "svm_child_dev%d", id); if (name == NULL) @@ -1063,7 +1060,7 @@ static int svm_acpi_add_core(struct svm_device *sdev, return err; } - attr = acpi_get_dma_attr(children); + attr = device_get_dma_attr(&cdev->dev); if (attr != DEV_DMA_NOT_SUPPORTED) { err = acpi_dma_configure(&cdev->dev, attr); if (err) { @@ -1072,12 +1069,14 @@ static int svm_acpi_add_core(struct svm_device *sdev, } } - err = acpi_dev_prop_read_single(children, "hisi,smmu-bypass", - DEV_PROP_U8, &cdev->smmu_bypass); + err = acpi_dev_get_property(children, "hisi,smmu-bypass", + DEV_PROP_U8, &obj); if (err) { dev_info(&children->dev, "read smmu bypass failed\n"); } + cdev->smmu_bypass = (u8)obj->integer.value; + 
cdev->group = iommu_group_get(&cdev->dev); if (IS_ERR_OR_NULL(cdev->group)) { dev_err(&cdev->dev, "smmu is not right configured\n"); @@ -1296,7 +1295,7 @@ static pte_t *svm_get_pte(struct vm_area_struct *vma, if (is_vm_hugetlb_page(vma)) { if (pud_present(*pud)) { - if (pud_huge(*pud)) { + if (pud_val(*pud) && !(pud_val(*pud) & PUD_TABLE_BIT)) { pte = (pte_t *)pud; *offset = addr & (PUD_SIZE - 1); size = PUD_SIZE; @@ -1347,11 +1346,11 @@ static pte_t *svm_walk_pt(unsigned long addr, unsigned long *page_size, return NULL; pgd = pgd_offset(mm, addr); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd)) return NULL; pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) + if (pud_none(*pud)) return NULL; return svm_get_pte(vma, pud, addr, page_size, offset); @@ -1540,11 +1539,11 @@ static long svm_remap_get_phys(struct mm_struct *mm, struct vm_area_struct *vma, return err; pgd = pgd_offset(mm, addr); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd)) return err; pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) + if (pud_none(*pud)) return err; pte = svm_get_pte(vma, pud, addr, page_size, offset); @@ -1585,20 +1584,19 @@ static long svm_remap_proc(unsigned long __user *arg) return -EINVAL; } - rcu_read_lock(); if (pmem.pid) { - ptask = find_task_by_vpid(pmem.pid); + ptask = find_get_task_by_vpid(pmem.pid); if (!ptask) { - rcu_read_unlock(); pr_err("No task for this pid\n"); return -EINVAL; } } else { + rcu_read_lock(); ptask = current; + get_task_struct(ptask); + rcu_read_unlock(); } - get_task_struct(ptask); - rcu_read_unlock(); pmm = ptask->mm; down_read(&mm->mmap_sem); -- Gitee From 94e056c86418ba4118e1252b0ac3fa6e1ff0ea94 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 15 Apr 2022 14:44:05 +0800 Subject: [PATCH 201/340] binder: use euid from cred instead of using task stable inclusion from linux-4.19.218 commit 5d40061285b81a7e213dc9b37acc4a0545eedf32 -------------------------------- commit 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b upstream. Save the 'struct cred' associated with a binder process at initial open to avoid potential race conditions when converting to an euid. Set a transaction's sender_euid from the 'struct cred' saved at binder_open() instead of looking up the euid from the binder proc's 'struct task'. This ensures the euid is associated with the security context that of the task that opened binder. 
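As a generic illustration of the pattern (the foo_* names are made up; only the binder change in the diff below is real): a driver that needs the opener's identity later pins the cred at open time and reads the euid from it, instead of deriving it from a task at transaction time.

  #include <linux/cred.h>
  #include <linux/fs.h>
  #include <linux/slab.h>

  struct foo_proc {
          const struct cred *cred;      /* pinned at open, invariant afterwards */
  };

  static int foo_open(struct inode *inode, struct file *filp)
  {
          struct foo_proc *proc = kzalloc(sizeof(*proc), GFP_KERNEL);

          if (!proc)
                  return -ENOMEM;
          /* take a reference on the credentials the file was opened with */
          proc->cred = get_cred(filp->f_cred);
          filp->private_data = proc;
          return 0;
  }

  static kuid_t foo_sender_euid(struct foo_proc *proc)
  {
          /* the saved cred reflects whoever opened the device, while
           * task_euid() on a stored task would follow later setuid()/exec
           * of that task and can race with the permission check */
          return proc->cred->euid;
  }

  static int foo_release(struct inode *inode, struct file *filp)
  {
          struct foo_proc *proc = filp->private_data;

          put_cred(proc->cred);         /* drop the reference taken at open */
          kfree(proc);
          return 0;
  }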
Cc: stable@vger.kernel.org # 4.4+ Fixes: 457b9a6f09f0 ("Staging: android: add binder driver") Signed-off-by: Todd Kjos Suggested-by: Stephen Smalley Suggested-by: Jann Horn Acked-by: Casey Schaufler Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/android/binder.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 18c32637047f..3ba23d29f50a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -483,6 +483,9 @@ enum binder_deferred_state { * @files files_struct for process * (protected by @files_lock) * @files_lock mutex to protect @files + * @cred struct cred associated with the `struct file` + * in binder_open() + * (invariant after initialized) * @deferred_work_node: element for binder_deferred_list * (protected by binder_deferred_lock) * @deferred_work: bitmap of deferred work to perform @@ -529,6 +532,7 @@ struct binder_proc { struct task_struct *tsk; struct files_struct *files; struct mutex files_lock; + const struct cred *cred; struct hlist_node deferred_work_node; int deferred_work; bool is_dead; @@ -2956,7 +2960,7 @@ static void binder_transaction(struct binder_proc *proc, t->from = thread; else t->from = NULL; - t->sender_euid = task_euid(proc->tsk); + t->sender_euid = proc->cred->euid; t->to_proc = target_proc; t->to_thread = target_thread; t->code = tr->code; @@ -4328,6 +4332,7 @@ static void binder_free_proc(struct binder_proc *proc) BUG_ON(!list_empty(&proc->delivered_death)); binder_alloc_deferred_release(&proc->alloc); put_task_struct(proc->tsk); + put_cred(proc->cred); binder_stats_deleted(BINDER_STAT_PROC); kfree(proc); } @@ -4786,6 +4791,7 @@ static int binder_open(struct inode *nodp, struct file *filp) get_task_struct(current->group_leader); proc->tsk = current->group_leader; mutex_init(&proc->files_lock); + proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); proc->default_priority = task_nice(current); binder_dev = container_of(filp->private_data, struct binder_device, -- Gitee From d1b9a67709f4a7ddeb01585718df846e08351977 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 15 Apr 2022 14:44:06 +0800 Subject: [PATCH 202/340] binder: use cred instead of task for selinux checks stable inclusion from linux-4.19.218 commit e82f3f9638f17d58e9a217bce127e2376aefcb9d -------------------------------- commit 52f88693378a58094c538662ba652aff0253c4fe upstream. Since binder was integrated with selinux, it has passed 'struct task_struct' associated with the binder_proc to represent the source and target of transactions. The conversion of task to SID was then done in the hook implementations. It turns out that there are race conditions which can result in an incorrect security context being used. Fix by using the 'struct cred' saved during binder_open and pass it to the selinux subsystem. 
Cc: stable@vger.kernel.org # 5.14 (need backport for earlier stables) Fixes: 79af73079d75 ("Add security hooks to binder and implement the hooks for SELinux.") Suggested-by: Jann Horn Signed-off-by: Todd Kjos Acked-by: Casey Schaufler Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman conflicts: drivers/android/binder.c Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/android/binder.c | 12 ++++++------ include/linux/lsm_hooks.h | 28 ++++++++++++++-------------- include/linux/security.h | 28 ++++++++++++++-------------- security/security.c | 14 +++++++------- security/selinux/hooks.c | 36 +++++++++++++++--------------------- 5 files changed, 56 insertions(+), 62 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3ba23d29f50a..10735051785d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2337,7 +2337,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, ret = -EINVAL; goto done; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } @@ -2383,7 +2383,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, proc->pid, thread->pid, fp->handle); return -EINVAL; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } @@ -2467,7 +2467,7 @@ static int binder_translate_fd(int fd, ret = -EBADF; goto err_fget; } - ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file); + ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; @@ -2845,8 +2845,8 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_binder; } e->to_node = target_node->debug_id; - if (security_binder_transaction(proc->tsk, - target_proc->tsk) < 0) { + if (security_binder_transaction(proc->cred, + target_proc->cred) < 0) { return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; @@ -4536,7 +4536,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) ret = -EBUSY; goto out; } - ret = security_binder_set_context_mgr(proc->tsk); + ret = security_binder_set_context_mgr(proc->cred); if (ret < 0) goto out; if (uid_valid(context->binder_context_mgr_uid)) { diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 97a020c616ad..d17ea5358368 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1211,22 +1211,22 @@ * * @binder_set_context_mgr: * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the task_struct for the task being registered. + * @mgr contains the struct cred for the current binder process. * Return 0 if permission is granted. * @binder_transaction: * Check whether @from is allowed to invoke a binder transaction call * to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_binder: * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. 
+ * @to contains the struct cred for the receiving process. * @binder_transfer_file: * Check whether @from is allowed to transfer @file to @to. - * @from contains the task_struct for the sending task. + * @from contains the struct cred for the sending process. * @file contains the struct file being transferred. - * @to contains the task_struct for the receiving task. + * @to contains the struct cred for the receiving process. * * @ptrace_access_check: * Check permission before allowing the current process to trace the @@ -1428,13 +1428,13 @@ * */ union security_list_options { - int (*binder_set_context_mgr)(struct task_struct *mgr); - int (*binder_transaction)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_binder)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_file)(struct task_struct *from, - struct task_struct *to, + int (*binder_set_context_mgr)(const struct cred *mgr); + int (*binder_transaction)(const struct cred *from, + const struct cred *to); + int (*binder_transfer_binder)(const struct cred *from, + const struct cred *to); + int (*binder_transfer_file)(const struct cred *from, + const struct cred *to, struct file *file); int (*ptrace_access_check)(struct task_struct *child, diff --git a/include/linux/security.h b/include/linux/security.h index 75f4156c84d7..9eb1c7a0f280 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -216,13 +216,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) extern int security_init(void); /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr); -int security_binder_transaction(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file); +int security_binder_set_context_mgr(const struct cred *mgr); +int security_binder_transaction(const struct cred *from, + const struct cred *to); +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to); +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file); int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, @@ -439,25 +439,25 @@ static inline int security_init(void) return 0; } -static inline int security_binder_set_context_mgr(struct task_struct *mgr) +static inline int security_binder_set_context_mgr(const struct cred *mgr) { return 0; } -static inline int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transaction(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, +static inline int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { return 0; diff --git a/security/security.c b/security/security.c index 9e4d6c999c79..8a9e1ececd7d 100644 --- a/security/security.c +++ b/security/security.c @@ -232,25 +232,25 @@ 
EXPORT_SYMBOL(unregister_lsm_notifier); /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr) +int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, 0, mgr); } -int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +int security_binder_transaction(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transaction, 0, from, to); } -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transfer_binder, 0, from, to); } -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file) +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { return call_int_hook(binder_transfer_file, 0, from, to, file); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 25353b4975c9..056f5de53e7e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2218,22 +2218,19 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_binder_set_context_mgr(struct task_struct *mgr) +static int selinux_binder_set_context_mgr(const struct cred *mgr) { - u32 mysid = current_sid(); - u32 mgrsid = task_sid(mgr); - return avc_has_perm(&selinux_state, - mysid, mgrsid, SECCLASS_BINDER, + current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } -static int selinux_binder_transaction(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transaction(const struct cred *from, + const struct cred *to) { u32 mysid = current_sid(); - u32 fromsid = task_sid(from); - u32 tosid = task_sid(to); + u32 fromsid = cred_sid(from); + u32 tosid = cred_sid(to); int rc; if (mysid != fromsid) { @@ -2244,27 +2241,24 @@ static int selinux_binder_transaction(struct task_struct *from, return rc; } - return avc_has_perm(&selinux_state, - fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, - NULL); + return avc_has_perm(&selinux_state, fromsid, tosid, + SECCLASS_BINDER, BINDER__CALL, NULL); } -static int selinux_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transfer_binder(const struct cred *from, + const struct cred *to) { - u32 fromsid = task_sid(from); - u32 tosid = task_sid(to); - return avc_has_perm(&selinux_state, - fromsid, tosid, SECCLASS_BINDER, BINDER__TRANSFER, + cred_sid(from), cred_sid(to), + SECCLASS_BINDER, BINDER__TRANSFER, NULL); } -static int selinux_binder_transfer_file(struct task_struct *from, - struct task_struct *to, +static int selinux_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { - u32 sid = task_sid(to); + u32 sid = cred_sid(to); struct file_security_struct *fsec = file->f_security; struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; -- Gitee From a88267d538c01e97ba50921983de6afdc9bf4b1c Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 15 Apr 2022 14:44:07 +0800 Subject: [PATCH 203/340] binder: fix test regression due to sender_euid change stable inclusion from linux-4.19.219 commit c3b9f29fca6682550d731c80745b421415c1e0af -------------------------------- commit c21a80ca0684ec2910344d72556c816cb8940c01 upstream. 
This is a partial revert of commit 29bc22ac5e5b ("binder: use euid from cred instead of using task"). Setting sender_euid using proc->cred caused some Android system test regressions that need further investigation. It is a partial reversion because subsequent patches rely on proc->cred. Fixes: 29bc22ac5e5b ("binder: use euid from cred instead of using task") Cc: stable@vger.kernel.org # 4.4+ Acked-by: Christian Brauner Signed-off-by: Todd Kjos Change-Id: I9b1769a3510fed250bb21859ef8beebabe034c66 Link: https://lore.kernel.org/r/20211112180720.2858135-1-tkjos@google.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 10735051785d..0654133efbe4 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2960,7 +2960,7 @@ static void binder_transaction(struct binder_proc *proc, t->from = thread; else t->from = NULL; - t->sender_euid = proc->cred->euid; + t->sender_euid = task_euid(proc->tsk); t->to_proc = target_proc; t->to_thread = target_thread; t->code = tr->code; -- Gitee From 4fd62a05a608e8c50258f6eafbc6f852def5c52c Mon Sep 17 00:00:00 2001 From: Simon Leiner Date: Fri, 15 Apr 2022 14:44:08 +0800 Subject: [PATCH 204/340] xen/xenbus: Fix granting of vmalloc'd memory stable inclusion from linux-4.19.144 commit 47eb291ba65bfade197e73ee13610d97809cb087 -------------------------------- [ Upstream commit d742db70033c745e410523e00522ee0cfe2aa416 ] On some architectures (like ARM), virt_to_gfn cannot be used for vmalloc'd memory because of its reliance on virt_to_phys. This patch introduces a check for vmalloc'd addresses and obtains the PFN using vmalloc_to_pfn in that case. Signed-off-by: Simon Leiner Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/20200825093153.35500-1-simon@leiner.me Signed-off-by: Juergen Gross Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/xen/xenbus/xenbus_client.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 6f6f6db94838..9955892d1ea6 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -371,8 +371,14 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, int i, j; for (i = 0; i < nr_pages; i++) { - err = gnttab_grant_foreign_access(dev->otherend_id, - virt_to_gfn(vaddr), 0); + unsigned long gfn; + + if (is_vmalloc_addr(vaddr)) + gfn = pfn_to_gfn(vmalloc_to_pfn(vaddr)); + else + gfn = virt_to_gfn(vaddr); + + err = gnttab_grant_foreign_access(dev->otherend_id, gfn, 0); if (err < 0) { xenbus_dev_fatal(dev, err, "granting access to ring page"); -- Gitee From 85586d4f7759da6b1903bfba4b7cd718995f4aca Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:09 +0800 Subject: [PATCH 205/340] xen/xenbus: don't let xenbus_grant_ring() remove grants in error case stable inclusion from linux-4.19.234 commit 8d521d960aef22781ff499e16899c30af899de8d -------------------------------- Commit 3777ea7bac3113005b7180e6b9dadf16d19a5827 upstream. Letting xenbus_grant_ring() tear down grants in the error case is problematic, as the other side could already have used these grants. 
Calling gnttab_end_foreign_access_ref() without checking success is resulting in an unclear situation for any caller of xenbus_grant_ring() as in the error case the memory pages of the ring page might be partially mapped. Freeing them would risk unwanted foreign access to them, while not freeing them would leak memory. In order to remove the need to undo any gnttab_grant_foreign_access() calls, use gnttab_alloc_grant_references() to make sure no further error can occur in the loop granting access to the ring pages. It should be noted that this way of handling removes leaking of grant entries in the error case, too. This is CVE-2022-23040 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/xen/xenbus/xenbus_client.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 9955892d1ea6..365cf9f5c967 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -368,7 +368,14 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, unsigned int nr_pages, grant_ref_t *grefs) { int err; - int i, j; + unsigned int i; + grant_ref_t gref_head; + + err = gnttab_alloc_grant_references(nr_pages, &gref_head); + if (err) { + xenbus_dev_fatal(dev, err, "granting access to ring page"); + return err; + } for (i = 0; i < nr_pages; i++) { unsigned long gfn; @@ -378,23 +385,14 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, else gfn = virt_to_gfn(vaddr); - err = gnttab_grant_foreign_access(dev->otherend_id, gfn, 0); - if (err < 0) { - xenbus_dev_fatal(dev, err, - "granting access to ring page"); - goto fail; - } - grefs[i] = err; + grefs[i] = gnttab_claim_grant_reference(&gref_head); + gnttab_grant_foreign_access_ref(grefs[i], dev->otherend_id, + gfn, 0); vaddr = vaddr + XEN_PAGE_SIZE; } return 0; - -fail: - for (j = 0; j < i; j++) - gnttab_end_foreign_access_ref(grefs[j], 0); - return err; } EXPORT_SYMBOL_GPL(xenbus_grant_ring); -- Gitee From efdc8ee28b9e821948ce8f7afb0047f9358b57c5 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:10 +0800 Subject: [PATCH 206/340] xen/grant-table: add gnttab_try_end_foreign_access() stable inclusion from linux-4.19.234 commit 17659846fe336366b1663194f5669d10f5947f53 -------------------------------- Commit 6b1775f26a2da2b05a6dc8ec2b5d14e9a4701a1a upstream. Add a new grant table function gnttab_try_end_foreign_access(), which will remove and free a grant if it is not in use. Its main use case is to either free a grant if it is no longer in use, or to take some other action if it is still in use. This other action can be an error exit, or (e.g. in the case of blkfront persistent grant feature) some special handling. This is CVE-2022-23036, CVE-2022-23038 / part of XSA-396. 
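A sketch of the calling pattern the new helper is meant for, based on the description above; the wrapper name, the page argument and the error reaction are illustrative assumptions, not code from this series.

  #include <linux/errno.h>
  #include <linux/gfp.h>
  #include <linux/printk.h>
  #include <xen/grant_table.h>

  /* Revoke a grant handed to the backend; reclaim the page only if the
   * revoke actually succeeded. */
  static int foo_revoke_granted_page(grant_ref_t ref, struct page *page)
  {
          if (gnttab_try_end_foreign_access(ref)) {
                  /* grant entry freed, the page is ours again */
                  __free_page(page);
                  return 0;
          }

          /* still mapped by the backend: never reuse or free the page here */
          pr_alert("grant %u still in use by the backend\n", ref);
          return -EBUSY;
  }

blkfront's persistent-grant handling in the next patch illustrates the "some other action" case: a still-mapped persistent grant is kept on its list for reuse rather than freed.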
Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/xen/grant-table.c | 14 ++++++++++++-- include/xen/grant_table.h | 12 ++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 97341fa75458..9524806eb4b9 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -436,11 +436,21 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly, what, ref, page ? page_to_pfn(page) : -1); } +int gnttab_try_end_foreign_access(grant_ref_t ref) +{ + int ret = _gnttab_end_foreign_access_ref(ref, 0); + + if (ret) + put_free_entry(ref); + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access); + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { - if (gnttab_end_foreign_access_ref(ref, readonly)) { - put_free_entry(ref); + if (gnttab_try_end_foreign_access(ref)) { if (page != 0) put_page(virt_to_page(page)); } else diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index a9978350b45b..7628ab25f686 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -97,10 +97,22 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * access has been ended, free the given page too. Access will be ended * immediately iff the grant entry is not in use, otherwise it will happen * some time later. page may be 0, in which case no freeing will occur. + * Note that the granted page might still be accessed (read or write) by the + * other side after gnttab_end_foreign_access() returns, so even if page was + * specified as 0 it is not allowed to just reuse the page for other + * purposes immediately. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); +/* + * End access through the given grant reference, iff the grant entry is + * no longer in use. In case of success ending foreign access, the + * grant reference is deallocated. + * Return 1 if the grant entry was freed, 0 if it is still in use. + */ +int gnttab_try_end_foreign_access(grant_ref_t ref); + int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); -- Gitee From 3b5b044ddd9697f6f52d3e96612248c11db48684 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:11 +0800 Subject: [PATCH 207/340] xen/blkfront: don't use gnttab_query_foreign_access() for mapped status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from linux-4.19.234 commit 423a3a50dce9a48d10d2d2a70cd2f78064c13703 -------------------------------- Commit abf1fd5919d6238ee3bc5eb4a9b6c3947caa6638 upstream. It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_end_foreign_access_ref() and check the success of that operation instead. For the ring allocation use alloc_pages_exact() in order to avoid high order pages in case of a multi-page ring. If a grant wasn't unmapped by the backend without persistent grants being used, set the device state to "error". This is CVE-2022-23036 / part of XSA-396. 
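To make the race concrete, a before/after fragment under the semantics described above (illustrative only; 'ref' stands for a grant the frontend gave to the backend and is assumed to be declared in the surrounding code):

  /* old pattern: racy, the backend can map 'ref' between the query and
   * the revoke, so the page may still be foreign-mapped when reused */
  if (!gnttab_query_foreign_access(ref))
          gnttab_end_foreign_access(ref, 0, 0UL);

  /* new pattern: revoke and learn the outcome in a single step */
  if (gnttab_end_foreign_access_ref(ref, 0)) {
          /* revoked: the page may be reused or freed */
  } else {
          /* still mapped by the backend: treat it as an error */
  }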
Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Roger Pau Monné Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/block/xen-blkfront.c | 63 +++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 4e52917ebd31..0e451b17f33a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1342,7 +1342,8 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); + free_pages_exact(rinfo->ring.sring, + info->nr_ring_pages * XEN_PAGE_SIZE); rinfo->ring.sring = NULL; if (rinfo->irq) @@ -1426,9 +1427,15 @@ static int blkif_get_final_status(enum blk_req_status s1, return BLKIF_RSP_OKAY; } -static bool blkif_completion(unsigned long *id, - struct blkfront_ring_info *rinfo, - struct blkif_response *bret) +/* + * Return values: + * 1 response processed. + * 0 missing further responses. + * -1 error while processing. + */ +static int blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, + struct blkif_response *bret) { int i = 0; struct scatterlist *sg; @@ -1451,7 +1458,7 @@ static bool blkif_completion(unsigned long *id, /* Wait the second response if not yet here. */ if (s2->status < REQ_DONE) - return false; + return 0; bret->status = blkif_get_final_status(s->status, s2->status); @@ -1502,42 +1509,43 @@ static bool blkif_completion(unsigned long *id, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < num_grant; i++) { - if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the * backend has chosen to make this grant persistent) * we add it at the head of the list, so it will be * reused first. */ - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->grants_used[i]->gref); + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + return -1; + } list_add(&s->grants_used[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { /* - * If the grant is not mapped by the backend we end the - * foreign access and add it to the tail of the list, - * so it will not be picked again unless we run out of - * persistent grants. + * If the grant is not mapped by the backend we add it + * to the tail of the list, so it will not be picked + * again unless we run out of persistent grants. 
*/ - gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { - if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->indirect_grants[i]->gref); + if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + return -1; + } list_add(&s->indirect_grants[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { struct page *indirect_page; - gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); /* * Add the used indirect page back to the list of * available pages for indirect grefs. @@ -1552,7 +1560,7 @@ static bool blkif_completion(unsigned long *id, } } - return true; + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1618,12 +1626,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } if (bret.operation != BLKIF_OP_DISCARD) { + int ret; + /* * We may need to wait for an extra response if the * I/O request is split in 2 */ - if (!blkif_completion(&id, rinfo, &bret)) + ret = blkif_completion(&id, rinfo, &bret); + if (!ret) continue; + if (unlikely(ret < 0)) + goto err; } if (add_id_to_freelist(rinfo, id)) { @@ -1729,8 +1742,7 @@ static int setup_blkring(struct xenbus_device *dev, for (i = 0; i < info->nr_ring_pages; i++) rinfo->ring_ref[i] = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, - get_order(ring_size)); + sring = alloc_pages_exact(ring_size, GFP_NOIO); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; @@ -1740,7 +1752,7 @@ static int setup_blkring(struct xenbus_device *dev, err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { - free_pages((unsigned long)sring, get_order(ring_size)); + free_pages_exact(sring, ring_size); rinfo->ring.sring = NULL; goto fail; } @@ -2719,11 +2731,10 @@ static void purge_persistent_grants(struct blkfront_info *info) list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants, node) { if (gnt_list_entry->gref == GRANT_INVALID_REF || - gnttab_query_foreign_access(gnt_list_entry->gref)) + !gnttab_try_end_foreign_access(gnt_list_entry->gref)) continue; list_del(&gnt_list_entry->node); - gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; gnt_list_entry->gref = GRANT_INVALID_REF; list_add_tail(&gnt_list_entry->node, &rinfo->grants); -- Gitee From e63e775c3c3f738ff9aa055519e6ccea6ed46583 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:12 +0800 Subject: [PATCH 208/340] xen/netfront: don't use gnttab_query_foreign_access() for mapped status stable inclusion from linux-4.19.234 commit 927e4eb8ddf4968b6a33be992b28063f84552c72 -------------------------------- Commit 31185df7e2b1d2fa1de4900247a12d7b9c7087eb upstream. It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_end_foreign_access_ref() and check the success of that operation instead. 
This is CVE-2022-23037 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/net/xen-netfront.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index da1b93f98cd1..2c879f07d3f2 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -412,14 +412,12 @@ static bool xennet_tx_buf_gc(struct netfront_queue *queue) queue->tx_link[id] = TX_LINK_NONE; skb = queue->tx_skbs[id]; queue->tx_skbs[id] = NULL; - if (unlikely(gnttab_query_foreign_access( - queue->grant_tx_ref[id]) != 0)) { + if (unlikely(!gnttab_end_foreign_access_ref( + queue->grant_tx_ref[id], GNTMAP_readonly))) { dev_alert(dev, "Grant still in use by backend domain\n"); goto err; } - gnttab_end_foreign_access_ref( - queue->grant_tx_ref[id], GNTMAP_readonly); gnttab_release_grant_reference( &queue->gref_tx_head, queue->grant_tx_ref[id]); queue->grant_tx_ref[id] = GRANT_INVALID_REF; -- Gitee From 4ba1e7c05179c9554e9e3e328163fe7ec375c8d0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:13 +0800 Subject: [PATCH 209/340] xen/scsifront: don't use gnttab_query_foreign_access() for mapped status stable inclusion from linux-4.19.234 commit 62a696c15cfcfd32527f81ca3d94f2bde57475dc -------------------------------- Commit 33172ab50a53578a95691310f49567c9266968b0 upstream. It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_try_end_foreign_access() and check the success of that operation instead. This is CVE-2022-23038 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/scsi/xen-scsifront.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 61389bdc7926..a1d822ae9fca 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -233,12 +233,11 @@ static void scsifront_gnttab_done(struct vscsifrnt_info *info, return; for (i = 0; i < shadow->nr_grants; i++) { - if (unlikely(gnttab_query_foreign_access(shadow->gref[i]))) { + if (unlikely(!gnttab_try_end_foreign_access(shadow->gref[i]))) { shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME "grant still in use by backend\n"); BUG(); } - gnttab_end_foreign_access(shadow->gref[i], 0, 0UL); } kfree(shadow->sg); -- Gitee From f0006464201fe2521cb18fcbc04b7fad458ea27f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:14 +0800 Subject: [PATCH 210/340] xen/gntalloc: don't use gnttab_query_foreign_access() stable inclusion from linux-4.19.234 commit fbc57368ea527dcfa909908fc47a851a56e4e5ce -------------------------------- Commit d3b6372c5881cb54925212abb62c521df8ba4809 upstream. Using gnttab_query_foreign_access() is unsafe, as it is racy by design. The use case in the gntalloc driver is not needed at all. 
While at it replace the call of gnttab_end_foreign_access_ref() with a call of gnttab_end_foreign_access(), which is what is really wanted there. In case the grant wasn't used due to an allocation failure, just free the grant via gnttab_free_grant_reference(). This is CVE-2022-23039 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/xen/gntalloc.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 3fa40c723e8e..edb0acd0b832 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -169,20 +169,14 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, __del_gref(gref); } - /* It's possible for the target domain to map the just-allocated grant - * references by blindly guessing their IDs; if this is done, then - * __del_gref will leave them in the queue_gref list. They need to be - * added to the global list so that we can free them when they are no - * longer referenced. - */ - if (unlikely(!list_empty(&queue_gref))) - list_splice_tail(&queue_gref, &gref_list); mutex_unlock(&gref_mutex); return rc; } static void __del_gref(struct gntalloc_gref *gref) { + unsigned long addr; + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { uint8_t *tmp = kmap(gref->page); tmp[gref->notify.pgoff] = 0; @@ -196,21 +190,16 @@ static void __del_gref(struct gntalloc_gref *gref) gref->notify.flags = 0; if (gref->gref_id) { - if (gnttab_query_foreign_access(gref->gref_id)) - return; - - if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) - return; - - gnttab_free_grant_reference(gref->gref_id); + if (gref->page) { + addr = (unsigned long)page_to_virt(gref->page); + gnttab_end_foreign_access(gref->gref_id, 0, addr); + } else + gnttab_free_grant_reference(gref->gref_id); } gref_size--; list_del(&gref->next_gref); - if (gref->page) - __free_page(gref->page); - kfree(gref); } -- Gitee From c16dd2ae5e63d66498b27028b46bec3c99960a83 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:15 +0800 Subject: [PATCH 211/340] xen: remove gnttab_query_foreign_access() stable inclusion from linux-4.19.234 commit c900f34fc134cc75de431e16546f37bf7804a012 -------------------------------- Commit 1dbd11ca75fe664d3e54607547771d021f531f59 upstream. Remove gnttab_query_foreign_access(), as it is unused and unsafe to use. All previous use cases assumed a grant would not be in use after gnttab_query_foreign_access() returned 0. This information is useless in best case, as it only refers to a situation in the past, which could have changed already. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/xen/grant-table.c | 25 ------------------------- include/xen/grant_table.h | 2 -- 2 files changed, 27 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 9524806eb4b9..1e785e65249a 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -134,13 +134,6 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); - /* - * Query the status of a grant entry. Ref parameter is reference of - * queried grant entry, return value is the status of queried entry. 
- * Detailed status(writing/reading) can be gotten from the return value - * by bit operations. - */ - int (*query_foreign_access)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -285,22 +278,6 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static int gnttab_query_foreign_access_v1(grant_ref_t ref) -{ - return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); -} - -static int gnttab_query_foreign_access_v2(grant_ref_t ref) -{ - return grstatus[ref] & (GTF_reading|GTF_writing); -} - -int gnttab_query_foreign_access(grant_ref_t ref) -{ - return gnttab_interface->query_foreign_access(ref); -} -EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); - static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) { u16 flags, nflags; @@ -1307,7 +1284,6 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, - .query_foreign_access = gnttab_query_foreign_access_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1319,7 +1295,6 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, - .query_foreign_access = gnttab_query_foreign_access_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 7628ab25f686..7087c9fea80f 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -118,8 +118,6 @@ int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); -int gnttab_query_foreign_access(grant_ref_t ref); - /* * operations on reserved batches of grant references */ -- Gitee From b0e5fec4687913b851c32f461d335e3c2130f39c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:16 +0800 Subject: [PATCH 212/340] xen/9p: use alloc/free_pages_exact() stable inclusion from linux-4.19.234 commit 2466bed361f3274e3e0ca9d8e539532481c06fea -------------------------------- Commit 5cadd4bb1d7fc9ab201ac14620d1a478357e4ebd upstream. Instead of __get_free_pages() and free_pages() use alloc_pages_exact() and free_pages_exact(). This is in preparation of a change of gnttab_end_foreign_access() which will prohibit use of high-order pages. By using the local variable "order" instead of ring->intf->ring_order in the error path of xen_9pfs_front_alloc_dataring() another bug is fixed, as the error path can be entered before ring->intf->ring_order is being set. By using alloc_pages_exact() the size in bytes is specified for the allocation, which fixes another bug for the case of order < (PAGE_SHIFT - XEN_PAGE_SHIFT). This is part of CVE-2022-23041 / XSA-396. 
Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- net/9p/trans_xen.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 9daab0dd833b..81002cba88b3 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -301,9 +301,9 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) ref = priv->rings[i].intf->ref[j]; gnttab_end_foreign_access(ref, 0, 0); } - free_pages((unsigned long)priv->rings[i].data.in, - XEN_9PFS_RING_ORDER - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(priv->rings[i].data.in, + 1UL << (XEN_9PFS_RING_ORDER + + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); free_page((unsigned long)priv->rings[i].intf); @@ -341,8 +341,8 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, if (ret < 0) goto out; ring->ref = ret; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + bytes = alloc_pages_exact(1UL << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT), + GFP_KERNEL | __GFP_ZERO); if (!bytes) { ret = -ENOMEM; goto out; @@ -373,9 +373,7 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, if (bytes) { for (i--; i >= 0; i--) gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); - free_pages((unsigned long)bytes, - XEN_9PFS_RING_ORDER - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(bytes, 1UL << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(ring->ref, 0, 0); free_page((unsigned long)ring->intf); -- Gitee From 6b310e722622a233becab4088711e57fb33a992a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:17 +0800 Subject: [PATCH 213/340] xen/pvcalls: use alloc/free_pages_exact() stable inclusion from linux-4.19.234 commit f85d03f0f482cc28a2ee15a1fed2ae57ae359412 -------------------------------- Commit b0576cc9c6b843d99c6982888d59a56209341888 upstream. Instead of __get_free_pages() and free_pages() use alloc_pages_exact() and free_pages_exact(). This is in preparation of a change of gnttab_end_foreign_access() which will prohibit use of high-order pages. This is part of CVE-2022-23041 / XSA-396. 
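The allocation change shared by the 9p and pvcalls patches, reduced to a sketch; 'ring_order' is a stand-in for the respective frontend's ring order macro, and the surrounding function is assumed.

  /* size the ring in bytes and use the *_exact() helpers, so that no
   * high-order allocation is ever handed to gnttab_end_foreign_access() */
  size_t bytes = PAGE_SIZE << ring_order;
  void *ring = alloc_pages_exact(bytes, GFP_KERNEL | __GFP_ZERO);

  if (!ring)
          return -ENOMEM;

  /* ... grant each XEN_PAGE_SIZE chunk of 'ring' individually ... */

  /* teardown: free with the same byte count, not with a page order */
  free_pages_exact(ring, bytes);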
Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/xen/pvcalls-front.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index d7438fdc5706..285bb8390a97 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -346,8 +346,8 @@ static void free_active_ring(struct sock_mapping *map) if (!map->active.ring) return; - free_pages((unsigned long)map->active.data.in, - map->active.ring->ring_order); + free_pages_exact(map->active.data.in, + PAGE_SIZE << map->active.ring->ring_order); free_page((unsigned long)map->active.ring); } @@ -361,8 +361,8 @@ static int alloc_active_ring(struct sock_mapping *map) goto out; map->active.ring->ring_order = PVCALLS_RING_ORDER; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PVCALLS_RING_ORDER); + bytes = alloc_pages_exact(PAGE_SIZE << PVCALLS_RING_ORDER, + GFP_KERNEL | __GFP_ZERO); if (!bytes) goto out; -- Gitee From ca1d0828717a0994f4ef1e5fd00ab4514f963c81 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:18 +0800 Subject: [PATCH 214/340] xen/gnttab: fix gnttab_end_foreign_access() without page specified stable inclusion from linux-4.19.234 commit 92dc0e4a219602242407dedd987dc9c8263c959b -------------------------------- Commit 42baefac638f06314298087394b982ead9ec444b upstream. gnttab_end_foreign_access() is used to free a grant reference and optionally to free the associated page. In case the grant is still in use by the other side processing is being deferred. This leads to a problem in case no page to be freed is specified by the caller: the caller doesn't know that the page is still mapped by the other side and thus should not be used for other purposes. The correct way to handle this situation is to take an additional reference to the granted page in case handling is being deferred and to drop that reference when the grant reference could be freed finally. This requires that there are no users of gnttab_end_foreign_access() left directly repurposing the granted page after the call, as this might result in clobbered data or information leaks via the not yet freed grant reference. This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/xen/grant-table.c | 36 +++++++++++++++++++++++++++++------- include/xen/grant_table.h | 7 ++++++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1e785e65249a..e434a583ee7e 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -134,6 +134,10 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); + /* + * Read the frame number related to a given grant reference. 
+ */ + unsigned long (*read_frame)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -331,6 +335,16 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +static unsigned long gnttab_read_frame_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].frame; +} + +static unsigned long gnttab_read_frame_v2(grant_ref_t ref) +{ + return gnttab_shared.v2[ref].full_page.frame; +} + struct deferred_entry { struct list_head list; grant_ref_t ref; @@ -360,12 +374,9 @@ static void gnttab_handle_deferred(struct timer_list *unused) spin_unlock_irqrestore(&gnttab_list_lock, flags); if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { put_free_entry(entry->ref); - if (entry->page) { - pr_debug("freeing g.e. %#x (pfn %#lx)\n", - entry->ref, page_to_pfn(entry->page)); - put_page(entry->page); - } else - pr_info("freeing g.e. %#x\n", entry->ref); + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + put_page(entry->page); kfree(entry); entry = NULL; } else { @@ -390,9 +401,18 @@ static void gnttab_handle_deferred(struct timer_list *unused) static void gnttab_add_deferred(grant_ref_t ref, bool readonly, struct page *page) { - struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; const char *what = KERN_WARNING "leaking"; + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { + unsigned long gfn = gnttab_interface->read_frame(ref); + + page = pfn_to_page(gfn_to_pfn(gfn)); + get_page(page); + } + if (entry) { unsigned long flags; @@ -1284,6 +1304,7 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, + .read_frame = gnttab_read_frame_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1295,6 +1316,7 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, + .read_frame = gnttab_read_frame_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 7087c9fea80f..a58a89cc0e97 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -100,7 +100,12 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * Note that the granted page might still be accessed (read or write) by the * other side after gnttab_end_foreign_access() returns, so even if page was * specified as 0 it is not allowed to just reuse the page for other - * purposes immediately. + * purposes immediately. gnttab_end_foreign_access() will take an additional + * reference to the granted page in this case, which is dropped only after + * the grant is no longer in use. + * This requires that multi page allocations for areas subject to + * gnttab_end_foreign_access() are done via alloc_pages_exact() (and freeing + * via free_pages_exact()) in order to avoid high order pages. 
*/ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); -- Gitee From 5dc640838ff5d519be706b06b977fa7c66ddf264 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 15 Apr 2022 14:44:19 +0800 Subject: [PATCH 215/340] xen/netfront: react properly to failing gnttab_end_foreign_access_ref() stable inclusion from linux-4.19.234 commit c307029d811e03546d18d0e512fe295b3103b8e5 -------------------------------- Commit 66e3531b33ee51dad17c463b4d9c9f52e341503d upstream. When calling gnttab_end_foreign_access_ref() the returned value must be tested and the reaction to that value should be appropriate. In case of failure in xennet_get_responses() the reaction should not be to crash the system, but to disable the network device. The calls in setup_netfront() can be replaced by calls of gnttab_end_foreign_access(). While at it avoid double free of ring pages and grant references via xennet_disconnect_backend() in this case. This is CVE-2022-23042 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Xiu Jianfeng Reviewed-by: Jason Yan Signed-off-by: Laibin Qiu --- drivers/net/xen-netfront.c | 48 ++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 2c879f07d3f2..376024c26270 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -838,7 +838,6 @@ static int xennet_get_responses(struct netfront_queue *queue, int max = XEN_NETIF_NR_SLOTS_MIN + (rx->status <= RX_COPY_THRESHOLD); int slots = 1; int err = 0; - unsigned long ret; if (rx->flags & XEN_NETRXF_extra_info) { err = xennet_get_extras(queue, extras, rp); @@ -869,8 +868,13 @@ static int xennet_get_responses(struct netfront_queue *queue, goto next; } - ret = gnttab_end_foreign_access_ref(ref, 0); - BUG_ON(!ret); + if (!gnttab_end_foreign_access_ref(ref, 0)) { + dev_alert(dev, + "Grant still in use by backend domain\n"); + queue->info->broken = true; + dev_alert(dev, "Disabled for further use\n"); + return -EINVAL; + } gnttab_release_grant_reference(&queue->gref_rx_head, ref); @@ -1074,6 +1078,10 @@ static int xennet_poll(struct napi_struct *napi, int budget) err = xennet_get_responses(queue, &rinfo, rp, &tmpq); if (unlikely(err)) { + if (queue->info->broken) { + spin_unlock(&queue->rx_lock); + return 0; + } err: while ((skb = __skb_dequeue(&tmpq))) __skb_queue_tail(&errq, skb); @@ -1644,7 +1652,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_queue *queue, unsigned int feature_split_evtchn) { struct xen_netif_tx_sring *txs; - struct xen_netif_rx_sring *rxs; + struct xen_netif_rx_sring *rxs = NULL; grant_ref_t gref; int err; @@ -1664,21 +1672,21 @@ static int setup_netfront(struct xenbus_device *dev, err = xenbus_grant_ring(dev, txs, 1, &gref); if (err < 0) - goto grant_tx_ring_fail; + goto fail; queue->tx_ring_ref = gref; rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); if (!rxs) { err = -ENOMEM; xenbus_dev_fatal(dev, err, "allocating rx ring page"); - goto alloc_rx_ring_fail; + goto fail; } SHARED_RING_INIT(rxs); FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, rxs, 1, &gref); if (err < 0) - goto grant_rx_ring_fail; + goto fail; queue->rx_ring_ref = gref; if (feature_split_evtchn) @@ -1691,22 +1699,28 @@ static int setup_netfront(struct xenbus_device *dev, err = 
setup_netfront_single(queue); if (err) - goto alloc_evtchn_fail; + goto fail; return 0; /* If we fail to setup netfront, it is safe to just revoke access to * granted pages because backend is not accessing it at this point. */ -alloc_evtchn_fail: - gnttab_end_foreign_access_ref(queue->rx_ring_ref, 0); -grant_rx_ring_fail: - free_page((unsigned long)rxs); -alloc_rx_ring_fail: - gnttab_end_foreign_access_ref(queue->tx_ring_ref, 0); -grant_tx_ring_fail: - free_page((unsigned long)txs); -fail: + fail: + if (queue->rx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->rx_ring_ref, 0, + (unsigned long)rxs); + queue->rx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)rxs); + } + if (queue->tx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->tx_ring_ref, 0, + (unsigned long)txs); + queue->tx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)txs); + } return err; } -- Gitee From dd9f1295656568a2d1afe89a375c59afa79cb186 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Sat, 16 Apr 2022 07:27:52 +0000 Subject: [PATCH 216/340] net: core: netlink: add helper refcount dec and lock function stable inclusion from linux-4.19.221 commit cd25f1099284a0cbe916344fc1e6c1ffed6c5306 category: bugfix CVE: CVE-2021-39713 -------------------------------- [ Upstream commit 6f99528e9797794b91b43321fbbc93fe772b0803 ] Rtnl lock is encapsulated in netlink and cannot be accessed by other modules directly. This means that reference counted objects that rely on rtnl lock cannot use it with refcounter helper function that atomically releases decrements reference and obtains mutex. This patch implements simple wrapper function around refcount_dec_and_lock that obtains rtnl lock if reference counter value reached 0. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller [Lee: Sent to Stable] Link: https://syzkaller.appspot.com/bug?id=d7e411c5472dd5da33d8cc921ccadc747743a568 Reported-by: syzbot+5f229e48cccc804062c0@syzkaller.appspotmail.com Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zhengchao Shao Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- include/linux/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 5225832bd6ff..9cdd76348d9a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -6,6 +6,7 @@ #include #include #include +#include #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); @@ -34,6 +35,7 @@ extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); +extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6cd0dbe42baa..2837cc03f69e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -130,6 +130,12 @@ int rtnl_is_locked(void) } EXPORT_SYMBOL(rtnl_is_locked); +bool refcount_dec_and_rtnl_lock(refcount_t *r) +{ + return refcount_dec_and_mutex_lock(r, &rtnl_mutex); +} +EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); + #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { -- Gitee From 5a1704c7f3c1f3499716b7f440efa002f6724a35 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Sat, 16 Apr 2022 07:27:53 +0000 Subject: [PATCH 217/340] net: sched: extend Qdisc with rcu stable inclusion from linux-4.19.221 commit f602ed9f8574512e7ea1ab65c3db7ba71053bf27 category: bugfix CVE: CVE-2021-39713 -------------------------------- [ Upstream commit 3a7d0d07a386716b459b00783b11a8211cefcc0f ] Currently, Qdisc API functions assume that users have rtnl lock taken. To implement rtnl unlocked classifiers update interface, Qdisc API must be extended with functions that do not require rtnl lock. Extend Qdisc structure with rcu. Implement special version of put function qdisc_put_unlocked() that is called without rtnl lock taken. This function only takes rtnl lock if Qdisc reference counter reached zero and is intended to be used as optimization. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller [Lee: Sent to Stable] Link: https://syzkaller.appspot.com/bug?id=d7e411c5472dd5da33d8cc921ccadc747743a568 Reported-by: syzbot+5f229e48cccc804062c0@syzkaller.appspotmail.com Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zhengchao Shao Conflict: net/sched/sch_generic.c Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- include/linux/rtnetlink.h | 5 +++++ include/net/pkt_sched.h | 1 + include/net/sch_generic.h | 1 + net/sched/sch_api.c | 18 ++++++++++++++++++ net/sched/sch_generic.c | 9 ++++++++- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 9cdd76348d9a..bb9cb84114c1 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -85,6 +85,11 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) return rtnl_dereference(dev->ingress_queue); } +static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev) +{ + return rcu_dereference(dev->ingress_queue); +} + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); #ifdef CONFIG_NET_INGRESS diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index edca90ef3bdc..1a6ac924266d 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -103,6 +103,7 @@ int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); +struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c9cd5086bd54..7ac17721c41a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -108,6 +108,7 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; + struct rcu_head rcu; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2c9aa550763d..19c390ab2b7c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -315,6 +315,24 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) return q; } +struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) +{ + struct netdev_queue *nq; + struct Qdisc *q; + + if (!handle) + return NULL; + q = qdisc_match_from_root(dev->qdisc, handle); + if (q) + goto out; + + nq = dev_ingress_queue_rcu(dev); + if (nq) + q = qdisc_match_from_root(nq->qdisc_sleeping, handle); +out: + return q; +} + static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index c2276a3c9dd8..ef41320a9e96 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -953,6 +953,13 @@ void qdisc_free(struct Qdisc *qdisc) kfree((char *) qdisc - qdisc->padded); } +void qdisc_free_cb(struct rcu_head *head) +{ + struct Qdisc *q = container_of(head, struct Qdisc, rcu); + + qdisc_free(q); +} + void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops; @@ -990,7 +997,7 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(skb); } - qdisc_free(qdisc); + call_rcu(&qdisc->rcu, qdisc_free_cb); } EXPORT_SYMBOL(qdisc_destroy); -- Gitee From 823b2cfc16d38dc146e599677f71ec86f5fff876 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Sat, 16 Apr 2022 07:27:54 +0000 Subject: [PATCH 
218/340] net: sched: add helper function to take reference to Qdisc stable inclusion from linux-4.19.221 commit da1d324088c40fa0a382224c466175fc5c704106 category: bugfix CVE: CVE-2021-39713 -------------------------------- [ Upstream commit 9d7e82cec35c027756ec97e274f878251f271181 ] Implement function to take reference to Qdisc that relies on rcu read lock instead of rtnl mutex. Function only takes reference to Qdisc if reference counter isn't zero. Intended to be used by unlocked cls API. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller [Lee: Sent to Stable] Link: https://syzkaller.appspot.com/bug?id=d7e411c5472dd5da33d8cc921ccadc747743a568 Reported-by: syzbot+5f229e48cccc804062c0@syzkaller.appspotmail.com Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zhengchao Shao Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- include/net/sch_generic.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7ac17721c41a..b86acf3d1ea6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -118,6 +118,19 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc) refcount_inc(&qdisc->refcnt); } +/* Intended to be used by unlocked users, when concurrent qdisc release is + * possible. + */ + +static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN) + return qdisc; + if (refcount_inc_not_zero(&qdisc->refcnt)) + return qdisc; + return NULL; +} + static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) -- Gitee From bad618de2fea9395c3e6a04f007bc88f801fc52b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Sat, 16 Apr 2022 07:27:55 +0000 Subject: [PATCH 219/340] net: sched: use Qdisc rcu API instead of relying on rtnl lock stable inclusion from linux-4.19.221 commit ae214e04b95ff64a4b0e9aab6742520bfde6ff0c category: bugfix CVE: CVE-2021-39713 -------------------------------- [ Upstream commit e368fdb61d8e7c67ac70791b23345b26d7bbc661 ] As a preparation from removing rtnl lock dependency from rules update path, use Qdisc rcu and reference counting capabilities instead of relying on rtnl lock while working with Qdiscs. Create new tcf_block_release() function, and use it to free resources taken by tcf_block_find(). Currently, this function only releases Qdisc and it is extended in next patches in this series. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller [Lee: Sent to Stable] Link: https://syzkaller.appspot.com/bug?id=d7e411c5472dd5da33d8cc921ccadc747743a568 Reported-by: syzbot+5f229e48cccc804062c0@syzkaller.appspotmail.com Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zhengchao Shao Conflict: net/sched/cls_api.c Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- net/sched/cls_api.c | 79 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4413aa8d4e82..ff05f08010c0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -539,6 +539,7 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, struct netlink_ext_ack *extack) { struct tcf_block *block; + int err = 0; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_lookup(net, block_index); @@ -550,55 +551,93 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, const struct Qdisc_class_ops *cops; struct net_device *dev; + rcu_read_lock(); + /* Find link */ - dev = __dev_get_by_index(net, ifindex); - if (!dev) + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); return ERR_PTR(-ENODEV); + } /* Find qdisc */ if (!*parent) { *q = dev->qdisc; *parent = (*q)->handle; } else { - *q = qdisc_lookup(dev, TC_H_MAJ(*parent)); + *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } } + *q = qdisc_refcount_inc_nz(*q); + if (!*q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + err = -EINVAL; + goto errout_rcu; + } + /* Is it classful? */ cops = (*q)->ops->cl_ops; if (!cops) { NL_SET_ERR_MSG(extack, "Qdisc not classful"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } if (!cops->tcf_block) { NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_rcu; } + /* At this point we know that qdisc is not noop_qdisc, + * which means that qdisc holds a reference to net_device + * and we hold a reference to qdisc, so it is safe to release + * rcu read lock. + */ + rcu_read_unlock(); + /* Do we search for filter, attached to class? */ if (TC_H_MIN(*parent)) { *cl = cops->find(*q, *parent); if (*cl == 0) { NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); - return ERR_PTR(-ENOENT); + err = -ENOENT; + goto errout_qdisc; } } /* And the last stroke */ block = cops->tcf_block(*q, *cl, extack); - if (!block) - return ERR_PTR(-EINVAL); + if (!block) { + err = -EINVAL; + goto errout_qdisc; + } if (tcf_block_shared(block)) { NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_qdisc; } } return block; + +errout_rcu: + rcu_read_unlock(); +errout_qdisc: + if (*q) + qdisc_destroy(*q); + return ERR_PTR(err); +} + +static void tcf_block_release(struct Qdisc *q, struct tcf_block *block) +{ + if (q) + qdisc_destroy(q); } struct tcf_block_owner_item { @@ -1336,6 +1375,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. 
*/ goto replay; @@ -1457,6 +1497,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1542,6 +1583,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1858,7 +1900,8 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { @@ -1870,23 +1913,27 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, tcf_chain_hold(chain); } else { NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; + err = -EEXIST; + goto errout_block; } } else { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); - return -ENOENT; + err = -ENOENT; + goto errout_block; } chain = tcf_chain_create(block, chain_index); if (!chain) { NL_SET_ERR_MSG(extack, "Failed to create filter chain"); - return -ENOMEM; + err = -ENOMEM; + goto errout_block; } } } else { if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } tcf_chain_hold(chain); } @@ -1930,6 +1977,8 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, errout: tcf_chain_put(chain); +errout_block: + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. */ goto replay; -- Gitee From 90aafa084597f886533445325239fc84b762391b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 16 Apr 2022 07:27:56 +0000 Subject: [PATCH 220/340] net_sched: fix a crash in tc_new_tfilter() stable inclusion from linux-4.19.221 commit f9ff09e266ca70c801b9911280f6ae64c9183d85 category: bugfix CVE: CVE-2021-39713 -------------------------------- commit 460b360104d51552a57f39e54b2589c9fd7fa0b3 upstream. When tcf_block_find() fails, it already rollbacks the qdisc refcnt, so its caller doesn't need to clean up this again. Avoid calling qdisc_put() again by resetting qdisc to NULL for callers. Reported-by: syzbot+37b8770e6d5a8220a039@syzkaller.appspotmail.com Fixes: e368fdb61d8e ("net: sched: use Qdisc rcu API instead of relying on rtnl lock") Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zhengchao Shao Conflict: net/sched/cls_api.c Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- net/sched/cls_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ff05f08010c0..3ca18e3a2be9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -629,8 +629,10 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, errout_rcu: rcu_read_unlock(); errout_qdisc: - if (*q) + if (*q) { qdisc_destroy(*q); + *q = NULL; + } return ERR_PTR(err); } -- Gitee From fe484adc98d528c23eed20fd03539b1ee3717547 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Sat, 16 Apr 2022 07:27:57 +0000 Subject: [PATCH 221/340] net: sched: adapt Qdisc kabi hulk inclusion category: bugfix CVE: CVE-2021-39713 -------------------------------- To adapt to KABI, put rcu before gso_skb for 64-bit kernel. RCU will use 16 Bytes, and the space is enough. It's unuse for 32-bit kernel. Signed-off-by: Zhengchao Shao Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- include/net/sch_generic.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b86acf3d1ea6..f5f45390828d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -93,7 +93,13 @@ struct Qdisc { struct gnet_stats_queue __percpu *cpu_qstats; int padded; refcount_t refcnt; - +#ifndef __GENKSYMS__ + /* To adapt to KABI, put rcu before gso_skb for 64-bit kernel. + * RCU will use 16 Bytes, and the space is enough. It's unuse + * for 32-bit kernel. + */ + struct rcu_head rcu; +#endif /* * For performance sake on SMP, we put highly modified fields at the end */ @@ -108,7 +114,6 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; - struct rcu_head rcu; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) -- Gitee From d138b1072c07c9c7e800eab0ce9cbe8481c455a3 Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Sat, 16 Apr 2022 07:27:58 +0000 Subject: [PATCH 222/340] nbd: fix possible overflow on 'first_minor' in nbd_dev_add() hulk inclusion category: bugfix, https://gitee.com/openeuler/kernel/issues/I51ABL bugzilla: 186386 CVE: NA -------------------------------- When 'index' is a big numbers, it may become negative which forced to 'int'. then 'index << part_shift' might overflow to a positive value that is not greater than '0xfffff', then sysfs might complains about duplicate creation. Because of this, move the 'index' judgment to the front will fix it and be better. Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices") Fixes: 940c264984fd ("nbd: fix possible overflow for 'first_minor' in nbd_dev_add()") Signed-off-by: Zhang Wensheng Reviewed-by: Jason Yan Signed-off-by: Yongqiang Liu --- drivers/block/nbd.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a35189098581..c2e5ba599418 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1704,14 +1704,6 @@ static int nbd_dev_add(int index) int err = -ENOMEM; int first_minor = index << part_shift; - /* - * Too big index can cause duplicate creation of sysfs files/links, - * because MKDEV() expect that the max first minor is MINORMASK, or - * index << part_shift can overflow. 
- */ - if (first_minor < index || first_minor > MINORMASK) - return -EINVAL; - nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL); if (!nbd) goto out; @@ -1844,8 +1836,20 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (info->attrs[NBD_ATTR_INDEX]) + if (info->attrs[NBD_ATTR_INDEX]) { index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + + /* + * Too big first_minor can cause duplicate creation of + * sysfs files/links, since index << part_shift might + * overflow, or MKDEV() expect that the max bits of + * first_minor is 20. + */ + if (index < 0 || index > MINORMASK >> part_shift) { + printk(KERN_ERR "nbd: illegal input index %d\n", index); + return -EINVAL; + } + } if (!info->attrs[NBD_ATTR_SOCKETS]) { printk(KERN_ERR "nbd: must specify at least one socket\n"); return -EINVAL; -- Gitee From d9c6a251740db75a3b8b69eb4e0cd784d57794f7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 16 Apr 2022 09:17:54 +0000 Subject: [PATCH 223/340] netfilter: nf_tables: initialize registers in nft_do_chain() mainline inclusion from mainline-v5.18-rc1 commit 4c905f6740a365464e91467aa50916555b28213d category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I50WAZ CVE: CVE-2022-1016 ------------------------------------------------- Initialize registers to avoid stack leak into userspace. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso conflict: net/netfilter/nf_tables_core.c Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/netfilter/nf_tables_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index a3850414dba2..7dfaad783cd5 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -144,7 +144,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) struct nft_rule *const *rules; const struct nft_rule *rule; const struct nft_expr *expr, *last; - struct nft_regs regs; + struct nft_regs regs = {}; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); -- Gitee From 742a0b5b67461e695f406a3b028860cfef1ae838 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 19 Apr 2022 01:27:54 +0000 Subject: [PATCH 224/340] sched/fair: Fix wrong cpu selecting from isolated domain mainline inclusion from mainline-v5.10-rc1 commit df3cb4ea1fb63ff326488efd671ba3c39034255e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I52QKB CVE: NA -------------------------------- We've met problems that occasionally tasks with full cpumask (e.g. by putting it into a cpuset or setting to full affinity) were migrated to our isolated cpus in production environment. After some analysis, we found that it is due to the current select_idle_smt() not considering the sched_domain mask. Steps to reproduce on my 31-CPU hyperthreads machine: 1. with boot parameter: "isolcpus=domain,2-31" (thread lists: 0,16 and 1,17) 2. cgcreate -g cpu:test; cgexec -g cpu:test "test_threads" 3. some threads will be migrated to the isolated cpu16~17. Fix it by checking the valid domain mask in select_idle_smt(). 
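To make the effect of the change easier to see before reading the diff, here is a minimal sketch of the condition the fix enforces; smt_sibling_is_candidate() is a made-up helper name used only for illustration, the real patch simply adds the second test inline in select_idle_smt().

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/sched/topology.h>

/*
 * Illustrative helper (not in the patch): a sibling CPU is a valid
 * target only if the task's affinity allows it *and* it lies inside
 * the sched_domain being searched. CPUs isolated with
 * isolcpus=domain,... are excluded from every domain span, so the
 * second test is what keeps them from being selected.
 */
static bool smt_sibling_is_candidate(struct task_struct *p,
				     struct sched_domain *sd, int cpu)
{
	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
		return false;	/* task affinity forbids this CPU */
	if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
		return false;	/* CPU sits outside the domain, e.g. isolated */
	return true;
}

With only the first test, a task whose cpumask covers all CPUs (for example one placed in a cpuset with a full mask) also passes for an isolated sibling, which is exactly the misplacement described above.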
Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings()) Reported-by: Wetp Zhang Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/1600930127-76857-1-git-send-email-xlpang@linux.alibaba.com Conflicts: kernel/sched/fair.c Signed-off-by: Yu jiahua Reviewed-by: zheng zucheng Reviewed-by: zheng zucheng Reviewed-by: Zhang Qiao Signed-off-by: Yongqiang Liu --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c7c0144a852..89937f7ca0bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6256,7 +6256,8 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; -- Gitee From af98db5ff58f3657d68ac5f744de3c9ad69388ac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Apr 2022 01:27:55 +0000 Subject: [PATCH 225/340] sched: Fix yet more sched_fork() races mainline inclusion from mainline-v5.17-rc5 commit b1e8206582f9d680cff7d04828708c8b6ab32957 category: bugfix bugzilla: 186609, https://gitee.com/openeuler/kernel/issues/I532B0 CVE: NA -------------------------------- Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit. Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net conflict: include/linux/sched/task.h kernel/fork.c kernel/sched/core.c Signed-off-by: Zhang Qiao Reviewed-by: Chen Hui Reviewed-by: Wang Weiyang Signed-off-by: Yongqiang Liu --- include/linux/sched/task.h | 2 +- kernel/fork.c | 12 +++++++++++- kernel/sched/core.c | 6 +++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c8b52b3ec865..1c2c099e393b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -33,7 +33,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p); +extern void sched_cgroup_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index 88463fd56930..231b01eba6e1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2063,6 +2063,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_free_futex_mutex; + /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. 
+ * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p); + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do @@ -2171,7 +2182,6 @@ static __latent_entropy struct task_struct *copy_process( proc_fork_connector(p); cgroup_post_fork(p); - sched_post_fork(p); cgroup_threadgroup_change_end(current); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 496ce71f93a7..b09153710259 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2357,8 +2357,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p) +void sched_cgroup_fork(struct task_struct *p) { + unsigned long flags; /* @@ -2369,6 +2370,9 @@ void sched_post_fork(struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_CGROUP_SCHED + p->sched_task_group = task_group(current); + #endif rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, -- Gitee From b759debf9a3c0d5baf38aa4575bef80689672b88 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 19 Apr 2022 01:27:56 +0000 Subject: [PATCH 226/340] Revert "ext4: fix file system corrupted when rmdir non empty directory with IO error" hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I52WOM CVE: NA -------------------------------- This reverts commit 35bd50bcc346402fb5bac3e3393876f6c5644ca9. Signed-off-by: Ye Bin Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 84f68e15e853..58dd74b80480 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2750,7 +2750,7 @@ bool ext4_empty_dir(struct inode *inode) */ bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); if (IS_ERR(bh)) - return false; + return true; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, @@ -2781,7 +2781,7 @@ bool ext4_empty_dir(struct inode *inode) continue; } if (IS_ERR(bh)) - return false; + return true; } de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); -- Gitee From 5ac697029bbf28b7bd762b3558ec79024c75aa00 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 19 Apr 2022 01:27:57 +0000 Subject: [PATCH 227/340] ext4: fix fs corruption when tring to remove a non-empty directory with IO error mainline inclusion from mainline-v5.18-rc1 commit 7aab5c84a0f6ec2290e2ba4a6b245178b1bf949a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I52WOM CVE: NA -------------------------------- We inject IO error when rmdir non empty direcory, then got issue as follows: step1: mkfs.ext4 -F /dev/sda step2: mount /dev/sda test step3: cd test step4: mkdir -p 1/2 step5: rmdir 1 [ 110.920551] ext4_empty_dir: inject fault [ 110.921926] EXT4-fs warning (device sda): ext4_rmdir:3113: inode #12: comm rmdir: empty directory '1' has too many links (3) step6: cd .. step7: umount test step8: fsck.ext4 -f /dev/sda e2fsck 1.42.9 (28-Dec-2013) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Entry '..' in .../??? (13) has deleted/unused inode 12. Clear? yes Pass 3: Checking directory connectivity Unconnected directory inode 13 (...) Connect to /lost+found? 
yes Pass 4: Checking reference counts Inode 13 ref count is 3, should be 2. Fix? yes Pass 5: Checking group summary information /dev/sda: ***** FILE SYSTEM WAS MODIFIED ***** /dev/sda: 12/131072 files (0.0% non-contiguous), 26157/524288 blocks ext4_rmdir if (!ext4_empty_dir(inode)) goto end_rmdir; ext4_empty_dir bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); if (IS_ERR(bh)) return true; Now if read directory block failed, 'ext4_empty_dir' will return true, assume directory is empty. Obviously, it will lead to above issue. To solve this issue, if read directory block failed 'ext4_empty_dir' just return false. To avoid making things worse when file system is already corrupted, 'ext4_empty_dir' also return false. Signed-off-by: Ye Bin Cc: stable@kernel.org Link: https://lore.kernel.org/r/20220228024815.3952506-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o conflicts: fs/ext4/namei.c Signed-off-by: Ye Bin Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/ext4/inline.c | 9 ++++----- fs/ext4/namei.c | 10 +++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index abb71ba0b1e6..b5d172bf3318 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1775,19 +1775,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; - bool ret = true; + bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %lu block", err, dir->i_ino); - return true; + return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; + ret = true; goto out; } @@ -1796,7 +1797,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); - ret = true; goto out; } @@ -1815,16 +1815,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); - ret = true; goto out; } if (le32_to_cpu(de->inode)) { - ret = false; goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } + ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 58dd74b80480..867deea2fbdb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2743,14 +2743,14 @@ bool ext4_empty_dir(struct inode *inode) sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { EXT4_ERROR_INODE(inode, "invalid size"); - return true; + return false; } /* The first directory block must not be a hole, * so treat it as DIRENT_HTREE */ bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); if (IS_ERR(bh)) - return true; + return false; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, @@ -2758,7 +2758,7 @@ bool ext4_empty_dir(struct inode *inode) le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); - return true; + return false; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); @@ -2767,7 +2767,7 @@ bool ext4_empty_dir(struct inode *inode) le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); - return true; + return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while 
(offset < inode->i_size) { @@ -2781,7 +2781,7 @@ bool ext4_empty_dir(struct inode *inode) continue; } if (IS_ERR(bh)) - return true; + return false; } de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); -- Gitee From 2a98e82a07909c97bc33d47e895b76dc9ca4a4fc Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 20 Apr 2022 11:53:20 +0000 Subject: [PATCH 228/340] mm/memory.c: fix clear_gigantic_page_chunk hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5323R CVE: NA -------------------------------- Patch "mm: parallelize clear_gigantic_page" put mem_map_next() after clear_user_highpage but not modify the parameter i. So it's actually the previous page being cleared in each loop. As a result, the last page has not been cleared. Fix it by put mem_map_next() back to the original position. Fixes: ae0cd4d46ced ("mm: parallelize clear_gigantic_page") Signed-off-by: Liu Shixin Reviewed-by: Wei Li Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d4853970a7c1..98c2872f8415 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4726,11 +4726,9 @@ static int clear_gigantic_page_chunk(unsigned long start, unsigned long end, unsigned long i; might_sleep(); - for (i = start; i < end; ++i) { + for (i = start; i < end; i++, p = mem_map_next(p, base_page, i)) { cond_resched(); clear_user_highpage(p, addr + i * PAGE_SIZE); - - p = mem_map_next(p, base_page, i); } return KTASK_RETURN_SUCCESS; -- Gitee From b5aab4b87f82f5f3d78053df16faa72c62457c7c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 20 Apr 2022 11:53:21 +0000 Subject: [PATCH 229/340] x86/speculation: Merge one test in spectre_v2_user_select_mitigation() stable inclusion from stable-v4.19.234 commit d86fc674172edc0794f84198405f59f00b71512a category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit a5ce9f2bb665d1d2b31f139a02dbaa2dfbb62fa6 upstream. Merge the test whether the CPU supports STIBP into the test which determines whether STIBP is required. Thus try to simplify what is already an insane logic. Remove a superfluous newline in a comment, while at it. Signed-off-by: Borislav Petkov Cc: Anthony Steinhauser Link: https://lkml.kernel.org/r/20200615065806.GB14668@zn.tnic [fllinden@amazon.com: fixed contextual conflict (comment) for 4.19] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman Conflicts: arch/x85/kernel/cpu/bugs.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/kernel/cpu/bugs.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 54ec900bd259..7dd614930801 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -756,10 +756,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not * required. 
*/ - if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* @@ -771,12 +773,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* - * If STIBP is not available, clear the STIBP mode. - */ - if (!boot_cpu_has(X86_FEATURE_STIBP)) - mode = SPECTRE_V2_USER_NONE; - spectre_v2_user_stibp = mode; set_mode: -- Gitee From 989c0a1dc345ef7e97c64e38957b58b236f2a262 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Apr 2022 11:53:22 +0000 Subject: [PATCH 230/340] x86,bugs: Unconditionally allow spectre_v2=retpoline,amd stable inclusion from stable-v4.19.234 commit 1dd7efcfcccee59c974f56848739d4f5a5a570c7 category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit f8a66d608a3e471e1202778c2a36cbdc96bae73b upstream. Currently Linux prevents usage of retpoline,amd on !AMD hardware, this is unfriendly and gets in the way of testing. Remove this restriction. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.487348118@infradead.org [fllinden@amazon.com: backported to 4.19 (no Hygon in 4.19)] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman Conflicts: arch/x85/kernel/cpu/bugs.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/kernel/cpu/bugs.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7dd614930801..791f7ee58b82 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -839,13 +839,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; -- Gitee From 827e6afa42b1bdf60c02583061237a254512e441 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 20 Apr 2022 11:53:23 +0000 Subject: [PATCH 231/340] x86/speculation: Rename RETPOLINE_AMD to RETPOLINE_LFENCE stable inclusion from stable-v4.19.234 commit 25440a8c77dd2fde6a8e9cfc0c616916febf408e category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit d45476d9832409371537013ebdd8dc1a7781f97a upstream. The RETPOLINE_AMD name is unfortunate since it isn't necessarily AMD only, in fact Hygon also uses it. Furthermore it will likely be sufficient for some Intel processors. Therefore rename the thing to RETPOLINE_LFENCE to better describe what it is. Add the spectre_v2=retpoline,lfence option as an alias to spectre_v2=retpoline,amd to preserve existing setups. However, the output of /sys/devices/system/cpu/vulnerabilities/spectre_v2 will be changed. [ bp: Fix typos, massage. 
] Co-developed-by: Josh Poimboeuf Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner [fllinden@amazon.com: backported to 4.19] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman Conflicts: arch/x85/kernel/cpu/bugs.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/nospec-branch.h | 12 +++++----- arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++--------- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0a4145100c3a..a75ef95455d9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -224,7 +224,7 @@ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e3f70c60e8cc..5bc65944fd6e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -119,7 +119,7 @@ ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_LFENCE #else jmp *\reg #endif @@ -130,7 +130,7 @@ ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_LFENCE #else call *\reg #endif @@ -181,7 +181,7 @@ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ @@ -211,7 +211,7 @@ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -223,8 +223,8 @@ /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, - SPECTRE_V2_RETPOLINE_GENERIC, - SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_RETPOLINE, + SPECTRE_V2_LFENCE, SPECTRE_V2_IBRS_ENHANCED, }; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 791f7ee58b82..09d03300a626 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -621,7 +621,7 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, - 
SPECTRE_V2_CMD_RETPOLINE_AMD, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, }; enum spectre_v2_user_cmd { @@ -781,8 +781,8 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", }; @@ -794,7 +794,8 @@ static const struct { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, }; @@ -832,13 +833,19 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && !IS_ENABLED(CONFIG_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -873,9 +880,9 @@ static void __init spectre_v2_select_mitigation(void) if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; - case SPECTRE_V2_CMD_RETPOLINE_AMD: + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_amd; + goto retpoline_lfence; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: if (IS_ENABLED(CONFIG_RETPOLINE)) @@ -892,17 +899,17 @@ static void __init spectre_v2_select_mitigation(void) retpoline_auto: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_amd: + retpoline_lfence: if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); goto retpoline_generic; } - mode = SPECTRE_V2_RETPOLINE_AMD; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + mode = SPECTRE_V2_LFENCE; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } else { retpoline_generic: - mode = SPECTRE_V2_RETPOLINE_GENERIC; + mode = SPECTRE_V2_RETPOLINE; setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 765df93de0c6..04a8f0c17609 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -203,7 +203,7 @@ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define 
X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCEs for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -- Gitee From 5aa35f5ddfd7545a42e28df8fc76936efe7a5f72 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Apr 2022 11:53:24 +0000 Subject: [PATCH 232/340] x86/speculation: Add eIBRS + Retpoline options stable inclusion from stable-v4.19.234 commit 3f66bedb96ff4c064a819e68499f79b38297ba26 category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit 1e19da8522c81bf46b335f84137165741e0d82b7 upstream. Thanks to the chaps at VUsec it is now clear that eIBRS is not sufficient, therefore allow enabling of retpolines along with eIBRS. Add spectre_v2=eibrs, spectre_v2=eibrs,lfence and spectre_v2=eibrs,retpoline options to explicitly pick your preferred means of mitigation. Since there's new mitigations there's also user visible changes in /sys/devices/system/cpu/vulnerabilities/spectre_v2 to reflect these new mitigations. [ bp: Massage commit message, trim error messages, do more precise eIBRS mode checking. ] Co-developed-by: Josh Poimboeuf Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Patrick Colp Reviewed-by: Thomas Gleixner [fllinden@amazon.com: backported to 4.19 (no Hygon)] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman Conflicts: arch/x85/kernel/cpu/bugs.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/include/asm/nospec-branch.h | 4 +- arch/x86/kernel/cpu/bugs.c | 132 +++++++++++++++++++-------- 2 files changed, 98 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 5bc65944fd6e..ddcd4d5eecec 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -225,7 +225,9 @@ enum spectre_v2_mitigation { SPECTRE_V2_NONE, SPECTRE_V2_RETPOLINE, SPECTRE_V2_LFENCE, - SPECTRE_V2_IBRS_ENHANCED, + SPECTRE_V2_EIBRS, + SPECTRE_V2_EIBRS_RETPOLINE, + SPECTRE_V2_EIBRS_LFENCE, }; /* The indirect branch speculation control variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 09d03300a626..9da94659e89f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -622,6 +622,9 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, }; enum spectre_v2_user_cmd { @@ -694,6 +697,13 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return (mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE); +} + static void __init spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) { @@ -761,7 +771,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return; /* @@ 
-783,7 +793,9 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", }; static const struct { @@ -797,6 +809,9 @@ static const struct { { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, }; @@ -834,15 +849,29 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) if ((cmd == SPECTRE_V2_CMD_RETPOLINE || cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE) && + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. 
Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -851,6 +880,24 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing, switching to generic retpoline\n"); + return SPECTRE_V2_RETPOLINE; + } + return SPECTRE_V2_LFENCE; + } + + return SPECTRE_V2_RETPOLINE; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -871,49 +918,60 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_IBRS_ENHANCED; - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - goto specv2_set_mode; + mode = SPECTRE_V2_EIBRS; + break; } - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + + mode = spectre_v2_select_retpoline(); break; + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_lfence; + mode = SPECTRE_V2_LFENCE; break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_generic; + mode = SPECTRE_V2_RETPOLINE; break; + case SPECTRE_V2_CMD_RETPOLINE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } - pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); - return; -retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_lfence: - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); - goto retpoline_generic; - } - mode = SPECTRE_V2_LFENCE; + if (spectre_v2_in_eibrs_mode(mode)) { + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + /* fallthrough */ + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); - } else { - retpoline_generic: - mode = SPECTRE_V2_RETPOLINE; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; } -specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -939,7 +997,7 @@ static void __init spectre_v2_select_mitigation(void) * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. 
*/ - if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } @@ -1584,7 +1642,7 @@ static ssize_t tsx_async_abort_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { -- Gitee From f72da20eb6593c02cef40233f34c6f9db89b1489 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Apr 2022 11:53:25 +0000 Subject: [PATCH 233/340] Documentation/hw-vuln: Update spectre doc stable inclusion from stable-v4.19.234 commit 7af95ef3ec6248696300fce5c68f6c8c4f50e4a4 category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit 5ad3eb1132453b9795ce5fd4572b1c18b292cca9 upstream. Update the doc with the new fun. [ bp: Massage commit message. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner [fllinden@amazon.com: backported to 4.19] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- Documentation/admin-guide/hw-vuln/spectre.rst | 42 +++++++++++++------ .../admin-guide/kernel-parameters.txt | 8 +++- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index e05e581af5cf..4776c112fe7d 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -131,6 +131,19 @@ steer its indirect branch speculations to gadget code, and measure the speculative execution's side effects left in level 1 cache to infer the victim's data. +Yet another variant 2 attack vector is for the attacker to poison the +Branch History Buffer (BHB) to speculatively steer an indirect branch +to a specific Branch Target Buffer (BTB) entry, even if the entry isn't +associated with the source address of the indirect branch. Specifically, +the BHB might be shared across privilege levels even in the presence of +Enhanced IBRS. + +Currently the only known real-world BHB attack vector is via +unprivileged eBPF. Therefore, it's highly recommended to not enable +unprivileged eBPF, especially when eIBRS is used (without retpolines). +For a full mitigation against BHB attacks, it's recommended to use +retpolines (or eIBRS combined with retpolines). 
+ Attack scenarios ---------------- @@ -364,13 +377,15 @@ The possible values in this file are: - Kernel status: - ==================================== ================================= - 'Not affected' The processor is not vulnerable - 'Vulnerable' Vulnerable, no mitigation - 'Mitigation: Full generic retpoline' Software-focused mitigation - 'Mitigation: Full AMD retpoline' AMD-specific software mitigation - 'Mitigation: Enhanced IBRS' Hardware-focused mitigation - ==================================== ================================= + ======================================== ================================= + 'Not affected' The processor is not vulnerable + 'Mitigation: None' Vulnerable, no mitigation + 'Mitigation: Retpolines' Use Retpoline thunks + 'Mitigation: LFENCE' Use LFENCE instructions + 'Mitigation: Enhanced IBRS' Hardware-focused mitigation + 'Mitigation: Enhanced IBRS + Retpolines' Hardware-focused + Retpolines + 'Mitigation: Enhanced IBRS + LFENCE' Hardware-focused + LFENCE + ======================================== ================================= - Firmware status: Show if Indirect Branch Restricted Speculation (IBRS) is used to protect against Spectre variant 2 attacks when calling firmware (x86 only). @@ -584,12 +599,13 @@ kernel command line. Specific mitigations can also be selected manually: - retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline auto pick between generic,lfence + retpoline,generic Retpolines + retpoline,lfence LFENCE; indirect branch + retpoline,amd alias for retpoline,lfence + eibrs enhanced IBRS + eibrs,retpoline enhanced IBRS + Retpolines + eibrs,lfence enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 170e25b192a1..212832a5ca68 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4413,8 +4413,12 @@ Specific mitigations can also be selected manually: retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline,generic - Retpolines + retpoline,lfence - LFENCE; indirect branch + retpoline,amd - alias for retpoline,lfence + eibrs - enhanced IBRS + eibrs,retpoline - enhanced IBRS + Retpolines + eibrs,lfence - enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. -- Gitee From 8fbdf654b00fdf629be3b94c4f64182b9522774a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Apr 2022 11:53:26 +0000 Subject: [PATCH 234/340] x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting stable inclusion from stable-v4.19.234 commit 995629e1d8e6751936c6e2b738f70b392b0461de category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit 44a3918c8245ab10c6c9719dd12e7a8d291980d8 upstream. With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. 
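For illustration only (not part of this series): the state these patches report can be inspected from userspace through the sysfs file named above. A minimal C sketch, assuming only that the file exists and holds a single line:

  /* Print the Spectre v2 status string the kernel exports via sysfs. */
  #include <stdio.h>

  int main(void)
  {
      const char *path = "/sys/devices/system/cpu/vulnerabilities/spectre_v2";
      char line[256];
      FILE *f = fopen(path, "r");

      if (!f) {
          perror(path);
          return 1;
      }
      if (fgets(line, sizeof(line), f))
          fputs(line, stdout);   /* e.g. one of the spectre_v2_strings entries */
      fclose(f);
      return 0;
  }

The mode that ends up being reported can be chosen on the kernel command line with the spectre_v2= options documented above, for example spectre_v2=eibrs,retpoline.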
Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner [fllinden@amazon.com: backported to 4.19] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman Conflicts: kernel/sysctl.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++++++++------ include/linux/bpf.h | 11 +++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9da94659e89f..9a9e5ac44bd5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "cpu.h" @@ -607,6 +608,16 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -950,6 +961,9 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + if (spectre_v2_in_eibrs_mode(mode)) { /* Force it so VMEXIT will restore correctly */ x86_spec_ctrl_base |= SPEC_CTRL_IBRS; @@ -1672,6 +1686,20 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1697,12 +1725,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c40fa25cf2ca..30fcf812df9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -546,6 +546,11 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -657,6 +662,12 @@ static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, { return ERR_PTR(-EOPNOTSUPP); } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, -- Gitee From 802b5d021cbd79845326b08da67e6c73d8e5896f Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Wed, 20 Apr 2022 11:53:27 +0000 Subject: [PATCH 235/340] x86/speculation: Use generic retpoline by default on AMD stable inclusion from stable-v4.19.234 commit d3cb3a6927222268a10b2f12dfb8c9444f7cc39e category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit 244d00b5dd4755f8df892c86cab35fb2cfd4f14b upstream. AMD retpoline may be susceptible to speculation. The speculation execution window for an incorrect indirect branch prediction using LFENCE/JMP sequence may potentially be large enough to allow exploitation using Spectre V2. By default, don't use retpoline,lfence on AMD. Instead, use the generic retpoline. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/kernel/cpu/bugs.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9a9e5ac44bd5..dc9ffb9b7615 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -898,14 +898,6 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_NONE; } - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("LFENCE not serializing, switching to generic retpoline\n"); - return SPECTRE_V2_RETPOLINE; - } - return SPECTRE_V2_LFENCE; - } - return SPECTRE_V2_RETPOLINE; } -- Gitee From de9842d50028c34b59eae563bdb7628564c6f05d Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Wed, 20 Apr 2022 11:53:28 +0000 Subject: [PATCH 236/340] x86/speculation: Update link to AMD speculation whitepaper stable inclusion from stable-v4.19.234 commit c034d344e733a3ac574dd09e39e911a50025c607 category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit e9b6013a7ce31535b04b02ba99babefe8a8599fa upstream. Update the link to the "Software Techniques for Managing Speculation on AMD Processors" whitepaper. 
Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- Documentation/admin-guide/hw-vuln/spectre.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 4776c112fe7d..3a6e1fcad3c0 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -60,8 +60,8 @@ privileged data touched during the speculative execution. Spectre variant 1 attacks take advantage of speculative execution of conditional branches, while Spectre variant 2 attacks use speculative execution of indirect branches to leak privileged memory. -See :ref:`[1] ` :ref:`[5] ` :ref:`[7] ` -:ref:`[10] ` :ref:`[11] `. +See :ref:`[1] ` :ref:`[5] ` :ref:`[6] ` +:ref:`[7] ` :ref:`[10] ` :ref:`[11] `. Spectre variant 1 (Bounds Check Bypass) --------------------------------------- @@ -746,7 +746,7 @@ AMD white papers: .. _spec_ref6: -[6] `Software techniques for managing speculation on AMD processors `_. +[6] `Software techniques for managing speculation on AMD processors `_. ARM white papers: -- Gitee From 50f090e8089195a9df8f9c690103268caa88a7f3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Apr 2022 11:53:29 +0000 Subject: [PATCH 237/340] x86/speculation: Warn about Spectre v2 LFENCE mitigation stable inclusion from stable-v4.19.234 commit 8bfdba77595aee5c3e83ed1c9994c35d6d409605 category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit eafd987d4a82c7bb5aa12f0e3b4f8f3dea93e678 upstream. With: f8a66d608a3e ("x86,bugs: Unconditionally allow spectre_v2=retpoline,amd") it became possible to enable the LFENCE "retpoline" on Intel. However, Intel doesn't recommend it, as it has some weaknesses compared to retpoline. Now AMD doesn't recommend it either. It can still be left available as a cmdline option. It's faster than retpoline but is weaker in certain scenarios -- particularly SMT, but even non-SMT may be vulnerable in some cases. So just unconditionally warn if the user requests it on the cmdline. [ bp: Massage commit message. 
] Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/kernel/cpu/bugs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index dc9ffb9b7615..138bb1b0093e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -608,6 +608,7 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #ifdef CONFIG_BPF_SYSCALL @@ -929,6 +930,7 @@ static void __init spectre_v2_select_mitigation(void) break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); mode = SPECTRE_V2_LFENCE; break; @@ -1680,6 +1682,9 @@ static char *ibpb_state(void) static ssize_t spectre_v2_show_state(char *buf) { + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); -- Gitee From 5308da81e586a2f124b9f5a12818dcba057ab786 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Apr 2022 11:53:30 +0000 Subject: [PATCH 238/340] x86/speculation: Warn about eIBRS + LFENCE + Unprivileged eBPF + SMT stable inclusion from stable-v4.19.234 commit 9711b12a3f4c0fc73dd257c1e467e6e42155a5f1 category: bugfix bugzilla: 186453, https://gitee.com/src-openeuler/kernel/issues/I50WBM CVE: CVE-2022-0001 -------------------------------- commit 0de05d056afdb00eca8c7bbb0c79a3438daf700c upstream. The commit 44a3918c8245 ("x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting") added a warning for the "eIBRS + unprivileged eBPF" combination, which has been shown to be vulnerable against Spectre v2 BHB-based attacks. However, there's no warning about the "eIBRS + LFENCE retpoline + unprivileged eBPF" combo. The LFENCE adds more protection by shortening the speculation window after a mispredicted branch. That makes an attack significantly more difficult, even with unprivileged eBPF. So at least for now the logic doesn't warn about that combination. But if you then add SMT into the mix, the SMT attack angle weakens the effectiveness of the LFENCE considerably. So extend the "eIBRS + unprivileged eBPF" warning to also include the "eIBRS + LFENCE + unprivileged eBPF + SMT" case. [ bp: Massage commit message. 
] Suggested-by: Alyssa Milburn Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/x86/kernel/cpu/bugs.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 138bb1b0093e..4651e9db46a6 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -610,12 +610,27 @@ static inline const char *spectre_v2_module_string(void) { return ""; } #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) { - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } } #endif @@ -1075,6 +1090,10 @@ void arch_smt_update(void) { mutex_lock(&spec_ctrl_mutex); + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; @@ -1686,7 +1705,11 @@ static ssize_t spectre_v2_show_state(char *buf) return sprintf(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) - return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], -- Gitee From 357bd9f9f1bb86c9607957a3a959d791f80dcbf4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:31 +0000 Subject: [PATCH 239/340] arm64: entry.S: Add ventry overflow sanity checks stable inclusion from stable-v4.19.236 commit e8bfe29afc09ac77b347540a0f4c789e6530a436 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 4330e2c5c04c27bebf89d34e0bc14e6943413067 upstream. Subsequent patches add even more code to the ventry slots. Ensure kernels that overflow a ventry slot don't get built. 
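The .org directive added below turns a ventry overflow into a build failure rather than a silent runtime problem. A rough C analogue (purely illustrative, assuming a fixed 128-byte slot as implied by .align 7) expresses the same "fail the build" idea with a static assertion:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  #define VENTRY_SLOT_SIZE 128            /* bytes per vector entry (.align 7) */

  /* Stand-in for the instruction words emitted into one vector slot. */
  static const uint32_t entry_sequence[24];   /* 24 * 4 = 96 bytes */

  static_assert(sizeof(entry_sequence) <= VENTRY_SLOT_SIZE,
                "vector entry overflows its 128-byte slot");

  int main(void)
  {
      printf("entry uses %zu of %d bytes\n",
             sizeof(entry_sequence), VENTRY_SLOT_SIZE);
      return 0;
  }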
Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9a044d425502..33a1cbe98e36 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -70,6 +70,7 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 +.Lventry_start\@: #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 alternative_if ARM64_UNMAP_KERNEL_AT_EL0 .if \el == 0 @@ -127,6 +128,7 @@ alternative_else_nop_endif mrs x0, tpidrro_el0 #endif b el\()\el\()_\label +.org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm .macro tramp_alias, dst, sym @@ -1084,6 +1086,7 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 add x30, x30, #(1b - tramp_vectors) isb ret +.org 1b + 128 // Did we overflow the ventry slot? .endm .macro tramp_exit, regsize = 64 -- Gitee From 7a79d423decaf8a0501343781e2e2480c1470823 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:32 +0000 Subject: [PATCH 240/340] arm64: entry: Make the trampoline cleanup optional stable inclusion from stable-v4.19.236 commit 87eccd56c52fcdd6c55b048d789da5c9c2e51ed3 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit d739da1694a0eaef0358a42b76904b611539b77b upstream. Subsequent patches will add additional sets of vectors that use the same tricks as the kpti vectors to reach the full-fat vectors. The full-fat vectors contain some cleanup for kpti that is patched in by alternatives when kpti is in use. Once there are additional vectors, the cleanup will be needed in more cases. But on big/little systems, the cleanup would be harmful if no trampoline vector were in use. Instead of forcing CPUs that don't need a trampoline vector to use one, make the trampoline cleanup optional. Entry at the top of the vectors will skip the cleanup. The trampoline vectors can then skip the first instruction, triggering the cleanup to run. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 33a1cbe98e36..ad5432e73538 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -72,16 +72,20 @@ .align 7 .Lventry_start\@: #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -alternative_if ARM64_UNMAP_KERNEL_AT_EL0 .if \el == 0 + /* + * This must be the first instruction of the EL0 vector entries. It is + * skipped by the trampoline vectors, to trigger the cleanup. + */ + b .Lskip_tramp_vectors_cleanup\@ .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr .else mov x30, xzr .endif +.Lskip_tramp_vectors_cleanup\@: .endif -alternative_else_nop_endif #endif sub sp, sp, #S_FRAME_SIZE @@ -1083,7 +1087,7 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 #endif prfm plil1strm, [x30, #(1b - tramp_vectors)] msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors) + add x30, x30, #(1b - tramp_vectors + 4) isb ret .org 1b + 128 // Did we overflow the ventry slot? 
-- Gitee From c74a988abc79808cdd1c051e00670f541159487b Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:33 +0000 Subject: [PATCH 241/340] arm64: entry: Free up another register on kpti's tramp_exit path stable inclusion from stable-v4.19.236 commit 51acb81130d1feee7fd043760b75f5377ab8d4f0 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 03aff3a77a58b5b52a77e00537a42090ad57b80b upstream. Kpti stashes x30 in far_el1 while it uses x30 for all its work. Making the vectors a per-cpu data structure will require a second register. Allow tramp_exit two registers before it unmaps the kernel, by leaving x30 on the stack, and stashing x29 in far_el1. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ad5432e73538..29d15f5bccdd 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -365,18 +365,20 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] - ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on eret context synchronization * when returning from IPI handler, and when returning to user-space. */ .if \el == 0 -alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 + ldr lr, [sp, #S_LR] + add sp, sp, #S_FRAME_SIZE // restore sp + eret +alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f - msr far_el1, x30 + msr far_el1, x29 tramp_alias x30, tramp_exit_native br x30 4: @@ -384,6 +386,8 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 br x30 #endif .else + ldr lr, [sp, #S_LR] + add sp, sp, #S_FRAME_SIZE // restore sp eret .endif .endm @@ -1096,10 +1100,12 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 .macro tramp_exit, regsize = 64 adr x30, tramp_vectors msr vbar_el1, x30 - tramp_unmap_kernel x30 + ldr lr, [sp, #S_LR] + tramp_unmap_kernel x29 .if \regsize == 64 - mrs x30, far_el1 + mrs x29, far_el1 .endif + add sp, sp, #S_FRAME_SIZE // restore sp eret .endm -- Gitee From 88f3bf9c621102b8540696b1ce08e5292c6285c6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:34 +0000 Subject: [PATCH 242/340] arm64: entry: Move the trampoline data page before the text page stable inclusion from stable-v4.19.236 commit 266b1ef1368e06ac4c5a89eb9774ef2bbaa54e19 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit c091fb6ae059cda563b2a4d93fdbc548ef34e1d6 upstream. The trampoline code has a data page that holds the address of the vectors, which is unmapped when running in user-space. This ensures that with CONFIG_RANDOMIZE_BASE, the randomised address of the kernel can't be discovered until after the kernel has been mapped. If the trampoline text page is extended to include multiple sets of vectors, it will be larger than a single page, making it tricky to find the data page without knowing the size of the trampoline text pages, which will vary with PAGE_SIZE. Move the data page to appear before the text page. 
This allows the data page to be found without knowing the size of the trampoline text pages. 'tramp_vectors' is used to refer to the beginning of the .entry.tramp.text section, do that explicitly. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/fixmap.h | 2 +- arch/arm64/kernel/entry.S | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index ec1e6d6fa14c..c0cfc6d3bf9f 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -59,8 +59,8 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - FIX_ENTRY_TRAMP_DATA, FIX_ENTRY_TRAMP_TEXT, + FIX_ENTRY_TRAMP_DATA, #define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 29d15f5bccdd..8de799e92970 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1067,6 +1067,11 @@ alternative_else_nop_endif */ .endm + .macro tramp_data_page dst + adr \dst, .entry.tramp.text + sub \dst, \dst, PAGE_SIZE + .endm + .macro tramp_ventry, regsize = 64 .align 7 1: @@ -1083,7 +1088,7 @@ alternative_else_nop_endif 2: tramp_map_kernel x30 #ifdef CONFIG_RANDOMIZE_BASE - adr x30, tramp_vectors + PAGE_SIZE + tramp_data_page x30 alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 ldr x30, [x30] #else @@ -1231,7 +1236,7 @@ ENTRY(__sdei_asm_entry_trampoline) 1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] #ifdef CONFIG_RANDOMIZE_BASE - adr x4, tramp_vectors + PAGE_SIZE + tramp_data_page x4 add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler ldr x4, [x4] #else -- Gitee From 656f57bdf30b41356f47c132e40577eedfa0253a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:35 +0000 Subject: [PATCH 243/340] arm64: entry: Allow tramp_alias to access symbols after the 4K boundary stable inclusion from stable-v4.19.236 commit ebcdd80d0016c7445e8395cec99b9ce266a26001 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 6c5bf79b69f911560fbf82214c0971af6e58e682 upstream. Systems using kpti enter and exit the kernel through a trampoline mapping that is always mapped, even when the kernel is not. tramp_valias is a macro to find the address of a symbol in the trampoline mapping. Adding extra sets of vectors will expand the size of the entry.tramp.text section to beyond 4K. tramp_valias will be unable to generate addresses for symbols beyond 4K as it uses the 12 bit immediate of the add instruction. As there are now two registers available when tramp_alias is called, use the extra register to avoid the 4K limit of the 12 bit immediate. 
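Conceptually (a sketch, not the kernel's code), tramp_alias computes alias(sym) = TRAMP_VALIAS + (sym - start of .entry.tramp.text). An add with a 12-bit immediate can only encode offsets up to 4095, so once the section grows past 4K the offset has to be carried in the spare register instead. The addresses below are invented purely for illustration:

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  static uintptr_t tramp_alias(uintptr_t tramp_valias,
                               uintptr_t tramp_text_start, uintptr_t sym)
  {
      uintptr_t offset = sym - tramp_text_start;   /* may now exceed 4095 */

      return tramp_valias + offset;
  }

  int main(void)
  {
      uintptr_t valias = 0x1000;
      uintptr_t text_start = 0x200000;
      uintptr_t sym = text_start + 0x1234;         /* beyond the old 4K reach */

      printf("aliased address: 0x%" PRIxPTR "\n",
             tramp_alias(valias, text_start, sym));
      return 0;
  }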
Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8de799e92970..15e117d7e290 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -135,9 +135,12 @@ .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm - .macro tramp_alias, dst, sym + .macro tramp_alias, dst, sym, tmp mov_q \dst, TRAMP_VALIAS - add \dst, \dst, #(\sym - .entry.tramp.text) + adr_l \tmp, \sym + add \dst, \dst, \tmp + adr_l \tmp, .entry.tramp.text + sub \dst, \dst, \tmp .endm // This macro corrupts x0-x3. It is the caller's duty @@ -379,10 +382,10 @@ alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f msr far_el1, x29 - tramp_alias x30, tramp_exit_native + tramp_alias x30, tramp_exit_native, x29 br x30 4: - tramp_alias x30, tramp_exit_compat + tramp_alias x30, tramp_exit_compat, x29 br x30 #endif .else @@ -1365,7 +1368,7 @@ alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline + tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline, tmp=x3 br x5 #endif ENDPROC(__sdei_asm_handler) -- Gitee From c13aede9aa9d5f2fa76794185836401f14c2d76c Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:36 +0000 Subject: [PATCH 244/340] arm64: entry: Don't assume tramp_vectors is the start of the vectors stable inclusion from stable-v4.19.236 commit af484e69b5e83095609d8b5c8abaf13a5460229e category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit ed50da7764535f1e24432ded289974f2bf2b0c5a upstream. The tramp_ventry macro uses tramp_vectors as the address of the vectors when calculating which ventry in the 'full fat' vectors to branch to. While there is one set of tramp_vectors, this will be true. Adding multiple sets of vectors will break this assumption. Move the generation of the vectors to a macro, and pass the start of the vectors as an argument to tramp_ventry. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 15e117d7e290..1dc27e6503e0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1075,7 +1075,7 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, regsize = 64 + .macro tramp_ventry, vector_start, regsize .align 7 1: .if \regsize == 64 @@ -1097,9 +1097,9 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 #else ldr x30, =vectors #endif - prfm plil1strm, [x30, #(1b - tramp_vectors)] + prfm plil1strm, [x30, #(1b - \vector_start)] msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors + 4) + add x30, x30, #(1b - \vector_start + 4) isb ret .org 1b + 128 // Did we overflow the ventry slot? 
@@ -1117,19 +1117,21 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 eret .endm - .align 11 -ENTRY(tramp_vectors) + .macro generate_tramp_vector +.Lvector_start\@: .space 0x400 - tramp_ventry - tramp_ventry - tramp_ventry - tramp_ventry + .rept 4 + tramp_ventry .Lvector_start\@, 64 + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32 + .endr + .endm - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 + .align 11 +ENTRY(tramp_vectors) + generate_tramp_vector END(tramp_vectors) ENTRY(tramp_exit_native) -- Gitee From 8b3358cfec8da4ac2837cdf5a7dc4ff9c05ac98a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:37 +0000 Subject: [PATCH 245/340] arm64: entry: Move trampoline macros out of ifdef'd section stable inclusion from stable-v4.19.236 commit f689fa53bb944873f75fe1584f446cae1aabd2c1 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 13d7a08352a83ef2252aeb464a5e08dfc06b5dfd upstream. The macros for building the kpti trampoline are all behind CONFIG_UNMAP_KERNEL_AT_EL0, and in a region that outputs to the .entry.tramp.text section. Move the macros out so they can be used to generate other kinds of trampoline. Only the symbols need to be guarded by CONFIG_UNMAP_KERNEL_AT_EL0 and appear in the .entry.tramp.text section. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 1dc27e6503e0..f9a89b146600 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1033,12 +1033,7 @@ ENDPROC(el0_svc) .popsection // .entry.text -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -/* - * Exception vectors trampoline. - */ - .pushsection ".entry.tramp.text", "ax" - + // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) @@ -1129,6 +1124,11 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 .endr .endm +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" .align 11 ENTRY(tramp_vectors) generate_tramp_vector -- Gitee From dc80b5a135110ad0ae99c2bf01b653d1d907feb2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:38 +0000 Subject: [PATCH 246/340] arm64: entry: Make the kpti trampoline's kpti sequence optional stable inclusion from stable-v4.19.236 commit 9e056623dfc538909ed2a914f70a66d68ec71ec3 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit c47e4d04ba0f1ea17353d85d45f611277507e07a upstream. Spectre-BHB needs to add sequences to the vectors. Having one global set of vectors is a problem for big/little systems where the sequence is costly on cpus that are not vulnerable. Making the vectors per-cpu in the style of KVM's bh_harden_hyp_vecs requires the vectors to be generated by macros. Make the kpti re-mapping of the kernel optional, so the macros can be used without kpti. 
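As a loose C analogue (hypothetical, not kernel code), the pattern is a single generator macro that stamps out entry variants, with a flag argument deciding whether the kpti step is emitted at all:

  #include <stdio.h>

  #define GENERATE_VENTRY(name, kpti)                               \
      static void name(void)                                        \
      {                                                             \
          if (kpti)                                                 \
              puts(#name ": remap kernel (kpti step)");             \
          puts(#name ": branch to the canonical vectors");          \
      }

  GENERATE_VENTRY(tramp_ventry_kpti, 1)   /* kpti trampoline flavour */
  GENERATE_VENTRY(el1_ventry_nokpti, 0)   /* reused later without kpti */

  int main(void)
  {
      tramp_ventry_kpti();
      el1_ventry_nokpti();
      return 0;
  }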
Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f9a89b146600..8500997474c4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1070,9 +1070,10 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, vector_start, regsize + .macro tramp_ventry, vector_start, regsize, kpti .align 7 1: + .if \kpti == 1 .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif @@ -1094,8 +1095,12 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 #endif prfm plil1strm, [x30, #(1b - \vector_start)] msr vbar_el1, x30 - add x30, x30, #(1b - \vector_start + 4) isb + .else + ldr x30, =vectors + .endif // \kpti == 1 + + add x30, x30, #(1b - \vector_start + 4) ret .org 1b + 128 // Did we overflow the ventry slot? .endm @@ -1112,15 +1117,15 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 eret .endm - .macro generate_tramp_vector + .macro generate_tramp_vector, kpti .Lvector_start\@: .space 0x400 .rept 4 - tramp_ventry .Lvector_start\@, 64 + tramp_ventry .Lvector_start\@, 64, \kpti .endr .rept 4 - tramp_ventry .Lvector_start\@, 32 + tramp_ventry .Lvector_start\@, 32, \kpti .endr .endm @@ -1131,7 +1136,7 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 .pushsection ".entry.tramp.text", "ax" .align 11 ENTRY(tramp_vectors) - generate_tramp_vector + generate_tramp_vector kpti=1 END(tramp_vectors) ENTRY(tramp_exit_native) -- Gitee From 6c8a2f25a1c12217d8e1cdc3ee7320e628fc4013 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:39 +0000 Subject: [PATCH 247/340] arm64: entry: Allow the trampoline text to occupy multiple pages stable inclusion from stable-v4.19.236 commit 22fdfcf1c2cea8e6dc383d46cbbe59d476d24a96 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit a9c406e6462ff14956d690de7bbe5131a5677dc9 upstream. Adding a second set of vectors to .entry.tramp.text will make it larger than a single 4K page. Allow the trampoline text to occupy up to three pages by adding two more fixmap slots. Previous changes to tramp_valias allowed it to reach beyond a single page. 
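The mapping loop this patch adds walks the trampoline text one page at a time, giving each page its own fixmap slot counting down from FIX_ENTRY_TRAMP_TEXT1. A simplified standalone model of that loop (page size, addresses and slot numbers are faked here):

  #include <stdio.h>
  #include <stddef.h>

  #define PAGE_SIZE           4096UL
  #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

  static void set_fixmap_stub(int slot, unsigned long pa)
  {
      printf("fixmap slot %d -> pa %#lx\n", slot, pa);
  }

  int main(void)
  {
      unsigned long pa_start = 0x40080000UL;   /* made-up physical address */
      size_t tramp_text_size = 9000;           /* needs three pages */
      int fix_entry_tramp_text1 = 10;          /* arbitrary slot number */
      size_t i;

      for (i = 0; i < DIV_ROUND_UP(tramp_text_size, PAGE_SIZE); i++)
          set_fixmap_stub(fix_entry_tramp_text1 - (int)i,
                          pa_start + i * PAGE_SIZE);
      return 0;
  }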
Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/fixmap.h | 6 ++++-- arch/arm64/include/asm/sections.h | 5 +++++ arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/vmlinux.lds.S | 2 +- arch/arm64/mm/mmu.c | 12 +++++++++--- 5 files changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index c0cfc6d3bf9f..3c962ef081f8 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -59,9 +59,11 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - FIX_ENTRY_TRAMP_TEXT, + FIX_ENTRY_TRAMP_TEXT3, + FIX_ENTRY_TRAMP_TEXT2, + FIX_ENTRY_TRAMP_TEXT1, FIX_ENTRY_TRAMP_DATA, -#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT1)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index caab039d6305..8d3f1eab58e0 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -30,4 +30,9 @@ extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; +static inline size_t entry_tramp_text_size(void) +{ + return __entry_tramp_text_end - __entry_tramp_text_start; +} + #endif /* __ASM_SECTIONS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8500997474c4..2c34f5d9db0a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1066,7 +1066,7 @@ alternative_else_nop_endif .endm .macro tramp_data_page dst - adr \dst, .entry.tramp.text + adr_l \dst, .entry.tramp.text sub \dst, \dst, PAGE_SIZE .endm diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index d6050c6e65bc..2b965dd67881 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -251,7 +251,7 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, "Entry trampoline text too big") #endif /* diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 300972b01939..fa53bee52f87 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -541,6 +541,8 @@ early_param("rodata", parse_rodata); #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { + int i; + pgprot_t prot = rodata_enabled ? 
PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); @@ -549,11 +551,15 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); - __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, - prot, pgd_pgtable_alloc, 0); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, + entry_tramp_text_size(), prot, pgd_pgtable_alloc, + 0); /* Map both the text and data into the kernel page table */ - __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) + __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, + pa_start + i * PAGE_SIZE, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern char __entry_tramp_data_start[]; -- Gitee From 4a7a38c5266e2d0dd02b81beb4b6de1d658893ec Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:40 +0000 Subject: [PATCH 248/340] arm64: entry: Add non-kpti __bp_harden_el1_vectors for mitigations stable inclusion from stable-v4.19.236 commit 901c0a20aa94d09a9328899e2dd69a8d43a3a920 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit aff65393fa1401e034656e349abd655cfe272de0 upstream. kpti is an optional feature, for systems not using kpti a set of vectors for the spectre-bhb mitigations is needed. Add another set of vectors, __bp_harden_el1_vectors, that will be used if a mitigation is needed and kpti is not in use. The EL1 ventries are repeated verbatim as there is no additional work needed for entry from EL1. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2c34f5d9db0a..6f787ae3bb56 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1073,10 +1073,11 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti .align 7 1: - .if \kpti == 1 .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy * entry onto the return stack and using a RET instruction to @@ -1159,6 +1160,38 @@ __entry_tramp_data_start: #endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ +/* + * Exception vectors for spectre mitigations on entry from EL1 when + * kpti is not in use. + */ + .macro generate_el1_vector +.Lvector_start\@: + kernel_ventry 1, sync_invalid // Synchronous EL1t + kernel_ventry 1, irq_invalid // IRQ EL1t + kernel_ventry 1, fiq_invalid // FIQ EL1t + kernel_ventry 1, error_invalid // Error EL1t + + kernel_ventry 1, sync // Synchronous EL1h + kernel_ventry 1, irq // IRQ EL1h + kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, error // Error EL1h + + .rept 4 + tramp_ventry .Lvector_start\@, 64, kpti=0 + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, kpti=0 + .endr + .endm + + .pushsection ".entry.text", "ax" + .align 11 +ENTRY(__bp_harden_el1_vectors) + generate_el1_vector +END(__bp_harden_el1_vectors) + .popsection + + /* * Register switch for AArch64. The callee-saved registers need to be saved * and restored. 
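The shape being generated is the usual sixteen-slot arm64 vector table; only the eight EL0 slots route through the trampoline machinery, while the EL1 slots stay as plain kernel entries. An illustrative C sketch of that layout (not kernel code, handler names invented):

  typedef void (*ventry_t)(void);

  static void kernel_ventry_el1(void)   { /* ordinary EL1 entry, unchanged */ }
  static void tramp_ventry_el0_64(void) { /* 64-bit EL0 entry via trampoline */ }
  static void tramp_ventry_el0_32(void) { /* 32-bit EL0 entry via trampoline */ }

  static const ventry_t bp_harden_el1_vectors[16] = {
      /* four EL1t then four EL1h slots: repeated verbatim */
      kernel_ventry_el1, kernel_ventry_el1, kernel_ventry_el1, kernel_ventry_el1,
      kernel_ventry_el1, kernel_ventry_el1, kernel_ventry_el1, kernel_ventry_el1,
      /* four 64-bit EL0 slots, then four 32-bit EL0 slots */
      tramp_ventry_el0_64, tramp_ventry_el0_64, tramp_ventry_el0_64, tramp_ventry_el0_64,
      tramp_ventry_el0_32, tramp_ventry_el0_32, tramp_ventry_el0_32, tramp_ventry_el0_32,
  };

  int main(void)
  {
      for (int i = 0; i < 16; i++)
          bp_harden_el1_vectors[i]();
      return 0;
  }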
On entry: -- Gitee From fef55b74167e2d00dd5027df583967d205fdf543 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:41 +0000 Subject: [PATCH 249/340] arm64: entry: Add vectors that have the bhb mitigation sequences stable inclusion from stable-v4.19.236 commit 91429ed04ebe9dbec88f97c6fd136b722bc3f3c5 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit ba2689234be92024e5635d30fe744f4853ad97db upstream. Some CPUs affected by Spectre-BHB need a sequence of branches, or a firmware call to be run before any indirect branch. This needs to go in the vectors. No CPU needs both. While this can be patched in, it would run on all CPUs as there is a single set of vectors. If only one part of a big/little combination is affected, the unaffected CPUs have to run the mitigation too. Create extra vectors that include the sequence. Subsequent patches will allow affected CPUs to select this set of vectors. Later patches will modify the loop count to match what the CPU requires. Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/assembler.h | 25 ++++++++++++++ arch/arm64/include/asm/vectors.h | 34 +++++++++++++++++++ arch/arm64/kernel/entry.S | 53 +++++++++++++++++++++++++----- include/linux/arm-smccc.h | 7 ++++ 4 files changed, 110 insertions(+), 9 deletions(-) create mode 100644 arch/arm64/include/asm/vectors.h diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index becb5454151e..3fbe273e84ad 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -703,4 +703,29 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .Lyield_out_\@ : .endm + .macro __mitigate_spectre_bhb_loop tmp +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + mov \tmp, #32 +.Lspectre_bhb_loop\@: + b . + 4 + subs \tmp, \tmp, #1 + b.ne .Lspectre_bhb_loop\@ + dsb nsh + isb +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + + /* Save/restores x0-x3 to the stack */ + .macro __mitigate_spectre_bhb_fw +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + stp x0, x1, [sp, #-16]! + stp x2, x3, [sp, #-16]! + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 +alternative_cb arm64_update_smccc_conduit + nop // Patched to SMC/HVC #0 +alternative_cb_end + ldp x2, x3, [sp], #16 + ldp x0, x1, [sp], #16 +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h new file mode 100644 index 000000000000..16ca74260375 --- /dev/null +++ b/arch/arm64/include/asm/vectors.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 ARM Ltd. + */ +#ifndef __ASM_VECTORS_H +#define __ASM_VECTORS_H + +/* + * Note: the order of this enum corresponds to two arrays in entry.S: + * tramp_vecs and __bp_harden_el1_vectors. By default the canonical + * 'full fat' vectors are used directly. + */ +enum arm64_bp_harden_el1_vectors { +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + /* + * Perform the BHB loop mitigation, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_LOOP, + + /* + * Make the SMC call for firmware mitigation, before branching to the + * canonical vectors. 
+ */ + EL1_VECTOR_BHB_FW, +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + + /* + * Remap the kernel before branching to the canonical vectors. + */ + EL1_VECTOR_KPTI, +}; + +#endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6f787ae3bb56..07670da037be 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1070,13 +1070,26 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, vector_start, regsize, kpti + +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 + + .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + .if \bhb == BHB_MITIGATION_LOOP + /* + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. + */ + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy @@ -1101,6 +1114,15 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 ldr x30, =vectors .endif // \kpti == 1 + .if \bhb == BHB_MITIGATION_FW + /* + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. + */ + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW + add x30, x30, #(1b - \vector_start + 4) ret .org 1b + 128 // Did we overflow the ventry slot? @@ -1108,6 +1130,9 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 .macro tramp_exit, regsize = 64 adr x30, tramp_vectors +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + add x30, x30, SZ_4K +#endif msr vbar_el1, x30 ldr lr, [sp, #S_LR] tramp_unmap_kernel x29 @@ -1118,26 +1143,32 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 eret .endm - .macro generate_tramp_vector, kpti + .macro generate_tramp_vector, kpti, bhb .Lvector_start\@: .space 0x400 .rept 4 - tramp_ventry .Lvector_start\@, 64, \kpti + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, \kpti + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb .endr .endm #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 /* * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.tramp.text", "ax" .align 11 ENTRY(tramp_vectors) - generate_tramp_vector kpti=1 +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE END(tramp_vectors) ENTRY(tramp_exit_native) @@ -1164,7 +1195,7 @@ __entry_tramp_data_start: * Exception vectors for spectre mitigations on entry from EL1 when * kpti is not in use. 
*/ - .macro generate_el1_vector + .macro generate_el1_vector, bhb .Lvector_start\@: kernel_ventry 1, sync_invalid // Synchronous EL1t kernel_ventry 1, irq_invalid // IRQ EL1t @@ -1177,17 +1208,21 @@ __entry_tramp_data_start: kernel_ventry 1, error // Error EL1h .rept 4 - tramp_ventry .Lvector_start\@, 64, kpti=0 + tramp_ventry .Lvector_start\@, 64, 0, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, kpti=0 + tramp_ventry .Lvector_start\@, 32, 0, \bhb .endr .endm +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.text", "ax" .align 11 ENTRY(__bp_harden_el1_vectors) - generate_el1_vector +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ END(__bp_harden_el1_vectors) .popsection diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index a378a1d53fb9..8d3eb27e6890 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -86,6 +86,13 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define ARM_SMCCC_ARCH_WORKAROUND_3 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, 0x3fff) + +#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 + #ifndef __ASSEMBLY__ #include -- Gitee From a68ead8779f9ac6b092949a9e0d7d5eee7337e21 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:42 +0000 Subject: [PATCH 250/340] arm64: entry: Add macro for reading symbol addresses from the trampoline stable inclusion from stable-v4.19.236 commit e18876b523d5f5fd8b8f34721f60a470caf20aa1 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit b28a8eebe81c186fdb1a0078263b30576c8e1f42 upstream. The trampoline code needs to use the address of symbols in the wider kernel, e.g. vectors. PC-relative addressing wouldn't work as the trampoline code doesn't run at the address the linker expected. tramp_ventry uses a literal pool, unless CONFIG_RANDOMIZE_BASE is set, in which case it uses the data page as a literal pool because the data page can be unmapped when running in user-space, which is required for CPUs vulnerable to meltdown. Pull this logic out as a macro, instead of adding a third copy of it. Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/kernel/entry.S | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 07670da037be..ceef7427e412 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1070,6 +1070,15 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm + .macro tramp_data_read_var dst, var +#ifdef CONFIG_RANDOMIZE_BASE + tramp_data_page \dst + add \dst, \dst, #:lo12:__entry_tramp_data_\var + ldr \dst, [\dst] +#else + ldr \dst, =\var +#endif + .endm #define BHB_MITIGATION_NONE 0 #define BHB_MITIGATION_LOOP 1 @@ -1100,13 +1109,8 @@ alternative_else_nop_endif b . 
2: tramp_map_kernel x30 -#ifdef CONFIG_RANDOMIZE_BASE - tramp_data_page x30 alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 - ldr x30, [x30] -#else - ldr x30, =vectors -#endif + tramp_data_read_var x30, vectors prfm plil1strm, [x30, #(1b - \vector_start)] msr vbar_el1, x30 isb @@ -1186,7 +1190,12 @@ END(tramp_exit_compat) .align PAGE_SHIFT .globl __entry_tramp_data_start __entry_tramp_data_start: +__entry_tramp_data_vectors: .quad vectors +#ifdef CONFIG_ARM_SDE_INTERFACE +__entry_tramp_data___sdei_asm_handler: + .quad __sdei_asm_handler +#endif /* CONFIG_ARM_SDE_INTERFACE */ .popsection // .rodata #endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ @@ -1313,13 +1322,7 @@ ENTRY(__sdei_asm_entry_trampoline) */ 1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] -#ifdef CONFIG_RANDOMIZE_BASE - tramp_data_page x4 - add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler - ldr x4, [x4] -#else - ldr x4, =__sdei_asm_handler -#endif + tramp_data_read_var x4, __sdei_asm_handler br x4 ENDPROC(__sdei_asm_entry_trampoline) NOKPROBE(__sdei_asm_entry_trampoline) @@ -1342,12 +1345,6 @@ ENDPROC(__sdei_asm_exit_trampoline) NOKPROBE(__sdei_asm_exit_trampoline) .ltorg .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE -.pushsection ".rodata", "a" -__sdei_asm_trampoline_next_handler: - .quad __sdei_asm_handler -.popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* -- Gitee From e829d824f4fa3f1e0aea8e02d309868c4f419334 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:43 +0000 Subject: [PATCH 251/340] arm64: Add percpu vectors for EL1 stable inclusion from stable-v4.19.236 commit 5b5ca2608fbd6f250281b6a1d0d73613f250e6f1 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit bd09128d16fac3c34b80bd6a29088ac632e8ce09 upstream. The Spectre-BHB workaround adds a firmware call to the vectors. This is needed on some CPUs, but not others. To avoid the unaffected CPU in a big/little pair from making the firmware call, create per cpu vectors. The per-cpu vectors only apply when returning from EL0. Systems using KPTI can use the canonical 'full-fat' vectors directly at EL1, the trampoline exit code will switch to this_cpu_vector on exit to EL0. Systems not using KPTI should always use this_cpu_vector. this_cpu_vector will point at a vector in tramp_vecs or __bp_harden_el1_vectors, depending on whether KPTI is in use. 
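For illustration only, not part of the patch: the per-CPU choice reduces to roughly the
following stand-alone model. Here kpti_enabled, tramp_base and el1_base are invented
stand-ins for arm64_kernel_unmapped_at_el0(), the TRAMP_VALIAS mapping and
__bp_harden_el1_vectors, and SLOT_SIZE mirrors the SZ_2K stride between vector copies.

  #include <stdio.h>
  #include <stdbool.h>
  #include <stddef.h>

  #define SLOT_SIZE 0x800                 /* SZ_2K: each vector copy is 2KiB */

  enum slot { SLOT_BHB_LOOP, SLOT_BHB_FW, SLOT_KPTI, NR_SLOTS };

  static bool kpti_enabled;                      /* models arm64_kernel_unmapped_at_el0() */
  static char tramp_base[NR_SLOTS * SLOT_SIZE];  /* models the TRAMP_VALIAS mapping */
  static char el1_base[NR_SLOTS * SLOT_SIZE];    /* models __bp_harden_el1_vectors */

  /* What this_cpu_vector ends up pointing at for a given slot. */
  static const char *pick_vector(enum slot slot)
  {
      if (kpti_enabled)
          return tramp_base + SLOT_SIZE * slot;  /* exit to EL0 via the trampoline copy */
      return el1_base + SLOT_SIZE * slot;        /* use the EL1 copy directly */
  }

  int main(void)
  {
      kpti_enabled = true;
      printf("kpti on:  KPTI slot at trampoline offset %zu\n",
             (size_t)(pick_vector(SLOT_KPTI) - tramp_base));

      kpti_enabled = false;
      printf("kpti off: FW slot at __bp_harden offset %zu\n",
             (size_t)(pick_vector(SLOT_BHB_FW) - el1_base));
      return 0;
  }

The kernel-side equivalent is arm64_get_bp_hardening_vector() in the vectors.h hunk below;
the model only shows the KPTI/non-KPTI split and the 2KiB slot arithmetic.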
Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Conflicts: arch/arm64/kernel/cpufeature.c arch/arm64/kvm/hyp/switch.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/mmu.h | 2 +- arch/arm64/include/asm/vectors.h | 27 +++++++++++++++++++++++++++ arch/arm64/kernel/cpufeature.c | 11 +++++++++++ arch/arm64/kernel/entry.S | 16 ++++++++++------ arch/arm64/kvm/hyp/switch.c | 8 ++++++-- 5 files changed, 55 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 3e02efc9cdd0..d6d47fa7623d 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -50,7 +50,7 @@ extern int res_mem_count; */ #define ASID(mm) ((mm)->context.id.counter & 0xffff) -static inline bool arm64_kernel_unmapped_at_el0(void) +static __always_inline bool arm64_kernel_unmapped_at_el0(void) { return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0); diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index 16ca74260375..3f76dfd9e074 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -5,6 +5,15 @@ #ifndef __ASM_VECTORS_H #define __ASM_VECTORS_H +#include +#include + +#include + +extern char vectors[]; +extern char tramp_vectors[]; +extern char __bp_harden_el1_vectors[]; + /* * Note: the order of this enum corresponds to two arrays in entry.S: * tramp_vecs and __bp_harden_el1_vectors. By default the canonical @@ -31,4 +40,22 @@ enum arm64_bp_harden_el1_vectors { EL1_VECTOR_KPTI, }; +/* The vectors to use on return from EL0. e.g. to remap the kernel */ +DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector); + +#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_VALIAS 0 +#endif + +static inline const char * +arm64_get_bp_hardening_vector(enum arm64_bp_harden_el1_vectors slot) +{ + if (arm64_kernel_unmapped_at_el0()) + return (char *)TRAMP_VALIAS + SZ_2K * slot; + + WARN_ON_ONCE(slot == EL1_VECTOR_KPTI); + + return __bp_harden_el1_vectors + SZ_2K * slot; +} + #endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1bf9d84265de..f25f0dfed813 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -20,11 +20,13 @@ #include #include +#include #include #include #include #include #include + #include #include #include @@ -33,6 +35,7 @@ #include #include #include +#include #include unsigned long elf_hwcap __read_mostly; @@ -54,6 +57,8 @@ EXPORT_SYMBOL(cpu_hwcaps); /* Need also bit for ARM64_CB_PATCH */ DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); +DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; + /* * Flag to indicate if we have computed the system wide * capabilities based on the boot time active CPUs. 
This @@ -1038,6 +1043,12 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) static bool kpti_applied = false; int cpu = smp_processor_id(); + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); + + __this_cpu_write(this_cpu_vector, v); + } + if (kpti_applied) return; diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ceef7427e412..65f66020985a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -71,7 +71,6 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 .Lventry_start\@: -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 /* * This must be the first instruction of the EL0 vector entries. It is @@ -86,7 +85,6 @@ .endif .Lskip_tramp_vectors_cleanup\@: .endif -#endif sub sp, sp, #S_FRAME_SIZE #ifdef CONFIG_VMAP_STACK @@ -1133,10 +1131,14 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 .endm .macro tramp_exit, regsize = 64 - adr x30, tramp_vectors -#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY - add x30, x30, SZ_4K -#endif + tramp_data_read_var x30, this_cpu_vector +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + mrs x29, tpidr_el1 +alternative_else + mrs x29, tpidr_el2 +alternative_endif + ldr x30, [x30, x29] + msr vbar_el1, x30 ldr lr, [sp, #S_LR] tramp_unmap_kernel x29 @@ -1196,6 +1198,8 @@ __entry_tramp_data_vectors: __entry_tramp_data___sdei_asm_handler: .quad __sdei_asm_handler #endif /* CONFIG_ARM_SDE_INTERFACE */ +__entry_tramp_data_this_cpu_vector: + .quad this_cpu_vector .popsection // .rodata #endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index ae473600cb97..6473c7c96b14 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -35,6 +35,7 @@ #include #include #include +#include /* Check whether the FP regs were dirtied while in the host-side run loop: */ static bool __hyp_text update_fp_enabled(struct kvm_vcpu *vcpu) @@ -153,10 +154,13 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) static void deactivate_traps_vhe(void) { - extern char vectors[]; /* kernel exception vectors */ + const char *host_vectors = vectors; write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); - write_sysreg(vectors, vbar_el1); + + if (!arm64_kernel_unmapped_at_el0()) + host_vectors = __this_cpu_read(this_cpu_vector); + write_sysreg(host_vectors, vbar_el1); } NOKPROBE_SYMBOL(deactivate_traps_vhe); -- Gitee From 39da68e72aa79e494eb74eb94163f75ca2075d52 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:44 +0000 Subject: [PATCH 252/340] arm64: proton-pack: Report Spectre-BHB vulnerabilities as part of Spectre-v2 stable inclusion from stable-v4.19.236 commit 7b012f6597e55a2ea4c7efe94b5d9a792b6e5757 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit dee435be76f4117410bbd90573a881fd33488f37 upstream. Speculation attacks against some high-performance processors can make use of branch history to influence future speculation as part of a spectre-v2 attack. This is not mitigated by CSV2, meaning CPUs that previously reported 'Not affected' are now moderately mitigated by CSV2. Update the value in /sys/devices/system/cpu/vulnerabilities/spectre_v2 to also show the state of the BHB mitigation. 
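To illustrate the strings this produces (illustrative only, not part of the patch: the
helper names below are invented for the sketch, and the two bool parameters model the
__spectrev2_safe and __hardenbp_enab flags used in the hunk):

  #include <stdio.h>
  #include <stdbool.h>

  enum mitigation_state { SPECTRE_UNAFFECTED, SPECTRE_MITIGATED, SPECTRE_VULNERABLE };

  static const char *bhb_suffix(enum mitigation_state bhb)
  {
      switch (bhb) {
      case SPECTRE_UNAFFECTED: return "";
      case SPECTRE_MITIGATED:  return ", BHB";
      default:                 return ", but not BHB";
      }
  }

  static void show(bool spectrev2_safe, bool hardenbp, enum mitigation_state bhb)
  {
      const char *v2 = "Branch predictor hardening";

      if (spectrev2_safe) {
          if (bhb == SPECTRE_UNAFFECTED) {
              puts("Not affected");
              return;
          }
          /* Affected by Spectre-BHB, so "Not affected" can no longer be reported. */
          v2 = "CSV2";
      }
      if (hardenbp)
          printf("Mitigation: %s%s\n", v2, bhb_suffix(bhb));
      else
          puts("Vulnerable");
  }

  int main(void)
  {
      show(true,  false, SPECTRE_UNAFFECTED);  /* Not affected */
      show(false, true,  SPECTRE_MITIGATED);   /* Mitigation: Branch predictor hardening, BHB */
      show(false, true,  SPECTRE_VULNERABLE);  /* Mitigation: Branch predictor hardening, but not BHB */
      show(false, false, SPECTRE_MITIGATED);   /* Vulnerable */
      return 0;
  }

The kernel-side logic is in the cpu_errata.c hunk below; the sketch only mirrors how the
BHB suffix is appended to the existing spectre_v2 string.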
Reviewed-by: Catalin Marinas [ code move to cpu_errata.c for backport ] Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Conflicts: arch/arm64/include/asm/cpufeature.h Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/cpufeature.h | 9 +++++++ arch/arm64/kernel/cpu_errata.c | 38 ++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index c43d02365fb6..0833197ea5c1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -549,6 +549,15 @@ static inline int arm64_get_ssbd_state(void) void arm64_set_ssbd_mitigation(bool state); +/* Watch out, ordering is important here. */ +enum mitigation_state { + SPECTRE_UNAFFECTED, + SPECTRE_MITIGATED, + SPECTRE_VULNERABLE, +}; + +enum mitigation_state arm64_get_spectre_bhb_state(void); + static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index a6f2451b4469..b25072519102 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -957,14 +957,39 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, return sprintf(buf, "Mitigation: __user pointer sanitization\n"); } +static const char *get_bhb_affected_string(enum mitigation_state bhb_state) +{ + switch (bhb_state) { + case SPECTRE_UNAFFECTED: + return ""; + default: + case SPECTRE_VULNERABLE: + return ", but not BHB"; + case SPECTRE_MITIGATED: + return ", BHB"; + } +} + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { - if (__spectrev2_safe) - return sprintf(buf, "Not affected\n"); + enum mitigation_state bhb_state = arm64_get_spectre_bhb_state(); + const char *bhb_str = get_bhb_affected_string(bhb_state); + const char *v2_str = "Branch predictor hardening"; + + if (__spectrev2_safe) { + if (bhb_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "Not affected\n"); + + /* + * Platforms affected by Spectre-BHB can't report + * "Not affected" for Spectre-v2. + */ + v2_str = "CSV2"; + } if (__hardenbp_enab) - return sprintf(buf, "Mitigation: Branch predictor hardening\n"); + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); return sprintf(buf, "Vulnerable\n"); } @@ -985,3 +1010,10 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, return sprintf(buf, "Vulnerable\n"); } + +static enum mitigation_state spectre_bhb_state; + +enum mitigation_state arm64_get_spectre_bhb_state(void) +{ + return spectre_bhb_state; +} -- Gitee From def2df575d30cc95c57ffd46939803f93cae489e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:45 +0000 Subject: [PATCH 253/340] KVM: arm64: Add templates for BHB mitigation sequences stable inclusion from stable-v4.19.236 commit a68912a3ae3413be5febcaa40e7e0ec1fd62adee category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- KVM writes the Spectre-v2 mitigation template at the beginning of each vector when a CPU requires a specific sequence to run. Because the template is copied, it can not be modified by the alternatives at runtime. As the KVM template code is intertwined with the bp-hardening callbacks, all templates must have a bp-hardening callback. 
Add templates for calling ARCH_WORKAROUND_3 and one for each value of K in the brancy-loop. Identify these sequences by a new parameter template_start, and add a copy of install_bp_hardening_cb() that is able to install them. Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Conflicts: arch/arm64/include/asm/cpucaps.h Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/cpucaps.h | 2 + arch/arm64/include/asm/kvm_mmu.h | 6 ++- arch/arm64/include/asm/mmu.h | 6 +++ arch/arm64/kernel/cpu_errata.c | 65 +++++++++++++++++++++++++++++++- arch/arm64/kvm/hyp/hyp-entry.S | 54 ++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 69515f9077e3..f6d2ed5341a4 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -64,4 +64,6 @@ #define ARM64_NCAPS 38 #endif +#define ARM64_SPECTRE_BHB 40 + #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 399228f9f254..f8f00c054776 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -488,7 +488,8 @@ static inline void *kvm_get_hyp_vector(void) void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); int slot = -1; - if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) { + if ((cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) || + cpus_have_const_cap(ARM64_SPECTRE_BHB)) && data->template_start) { vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs_start)); slot = data->hyp_vectors_slot; } @@ -517,7 +518,8 @@ static inline int kvm_map_vectors(void) * !HBP + HEL2 -> allocate one vector slot and use exec mapping * HBP + HEL2 -> use hardened vertors and use exec mapping */ - if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) { + if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) || + cpus_have_const_cap(ARM64_SPECTRE_BHB)) { __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs_start); __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base); } diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index d6d47fa7623d..fac1a43f4ac3 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -61,6 +61,12 @@ typedef void (*bp_hardening_cb_t)(void); struct bp_hardening_data { int hyp_vectors_slot; bp_hardening_cb_t fn; + + /* + * template_start is only used by the BHB mitigation to identify the + * hyp_vectors_slot sequence. 
+ */ + const char *template_start; }; #if (defined(CONFIG_HARDEN_BRANCH_PREDICTOR) || \ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index b25072519102..82ad16b59366 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -147,6 +147,14 @@ DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); #ifdef CONFIG_KVM_INDIRECT_VECTORS extern char __smccc_workaround_1_smc_start[]; extern char __smccc_workaround_1_smc_end[]; +extern char __smccc_workaround_3_smc_start[]; +extern char __smccc_workaround_3_smc_end[]; +extern char __spectre_bhb_loop_k8_start[]; +extern char __spectre_bhb_loop_k8_end[]; +extern char __spectre_bhb_loop_k24_start[]; +extern char __spectre_bhb_loop_k24_end[]; +extern char __spectre_bhb_loop_k32_start[]; +extern char __spectre_bhb_loop_k32_end[]; static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, const char *hyp_vecs_end) @@ -160,11 +168,11 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); } +static DEFINE_SPINLOCK(bp_lock); static void install_bp_hardening_cb(bp_hardening_cb_t fn, const char *hyp_vecs_start, const char *hyp_vecs_end) { - static DEFINE_SPINLOCK(bp_lock); int cpu, slot = -1; spin_lock(&bp_lock); @@ -183,6 +191,7 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn, __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); __this_cpu_write(bp_hardening_data.fn, fn); + __this_cpu_write(bp_hardening_data.template_start, hyp_vecs_start); spin_unlock(&bp_lock); } #else @@ -1017,3 +1026,57 @@ enum mitigation_state arm64_get_spectre_bhb_state(void) { return spectre_bhb_state; } + +#ifdef CONFIG_KVM_INDIRECT_VECTORS +static const char *kvm_bhb_get_vecs_end(const char *start) +{ + if (start == __smccc_workaround_3_smc_start) + return __smccc_workaround_3_smc_end; + else if (start == __spectre_bhb_loop_k8_start) + return __spectre_bhb_loop_k8_end; + else if (start == __spectre_bhb_loop_k24_start) + return __spectre_bhb_loop_k24_end; + else if (start == __spectre_bhb_loop_k32_start) + return __spectre_bhb_loop_k32_end; + + return NULL; +} + +void kvm_setup_bhb_slot(const char *hyp_vecs_start) +{ + int cpu, slot = -1; + const char *hyp_vecs_end; + + if (!IS_ENABLED(CONFIG_KVM) || !is_hyp_mode_available()) + return; + + hyp_vecs_end = kvm_bhb_get_vecs_end(hyp_vecs_start); + if (WARN_ON_ONCE(!hyp_vecs_start || !hyp_vecs_end)) + return; + + spin_lock(&bp_lock); + for_each_possible_cpu(cpu) { + if (per_cpu(bp_hardening_data.template_start, cpu) == hyp_vecs_start) { + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); + break; + } + } + + if (slot == -1) { + slot = atomic_inc_return(&arm64_el2_vector_last_slot); + BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); + } + + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); + __this_cpu_write(bp_hardening_data.template_start, hyp_vecs_start); + spin_unlock(&bp_lock); +} +#else +#define __smccc_workaround_3_smc_start NULL +#define __spectre_bhb_loop_k8_start NULL +#define __spectre_bhb_loop_k24_start NULL +#define __spectre_bhb_loop_k32_start NULL + +void kvm_setup_bhb_slot(const char *hyp_vecs_start) { }; +#endif diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index b1f14f736962..acdf1e51e51c 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -329,4 +329,58 @@ ENTRY(__smccc_workaround_1_smc_start) ldp x0, x1, [sp, #(8 
* 2)] add sp, sp, #(8 * 4) ENTRY(__smccc_workaround_1_smc_end) + +ENTRY(__smccc_workaround_3_smc_start) + esb + sub sp, sp, #(8 * 4) + stp x2, x3, [sp, #(8 * 0)] + stp x0, x1, [sp, #(8 * 2)] + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 + smc #0 + ldp x2, x3, [sp, #(8 * 0)] + ldp x0, x1, [sp, #(8 * 2)] + add sp, sp, #(8 * 4) +ENTRY(__smccc_workaround_3_smc_end) + +ENTRY(__spectre_bhb_loop_k8_start) + esb + sub sp, sp, #(8 * 2) + stp x0, x1, [sp, #(8 * 0)] + mov x0, #8 +2: b . + 4 + subs x0, x0, #1 + b.ne 2b + dsb nsh + isb + ldp x0, x1, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +ENTRY(__spectre_bhb_loop_k8_end) + +ENTRY(__spectre_bhb_loop_k24_start) + esb + sub sp, sp, #(8 * 2) + stp x0, x1, [sp, #(8 * 0)] + mov x0, #24 +2: b . + 4 + subs x0, x0, #1 + b.ne 2b + dsb nsh + isb + ldp x0, x1, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +ENTRY(__spectre_bhb_loop_k24_end) + +ENTRY(__spectre_bhb_loop_k32_start) + esb + sub sp, sp, #(8 * 2) + stp x0, x1, [sp, #(8 * 0)] + mov x0, #32 +2: b . + 4 + subs x0, x0, #1 + b.ne 2b + dsb nsh + isb + ldp x0, x1, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +ENTRY(__spectre_bhb_loop_k32_end) #endif -- Gitee From cb917077efa4fecd975ab17d6aececdd90abe82e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:46 +0000 Subject: [PATCH 254/340] arm64: Mitigate spectre style branch history side channels stable inclusion from stable-v4.19.236 commit c20d551744797000c4af993f7d59ef8c69732949 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 558c303c9734af5a813739cd284879227f7297d2 upstream. Speculation attacks against some high-performance processors can make use of branch history to influence future speculation. When taking an exception from user-space, a sequence of branches or a firmware call overwrites or invalidates the branch history. The sequence of branches is added to the vectors, and should appear before the first indirect branch. For systems using KPTI the sequence is added to the kpti trampoline where it has a free register as the exit from the trampoline is via a 'ret'. For systems not using KPTI, the same register tricks are used to free up a register in the vectors. For the firmware call, arch-workaround-3 clobbers 4 registers, so there is no choice but to save them to the EL1 stack. This only happens for entry from EL0, so if we take an exception due to the stack access, it will not become re-entrant. For KVM, the existing branch-predictor-hardening vectors are used. When a spectre version of these vectors is in use, the firmware call is sufficient to mitigate against Spectre-BHB. For the non-spectre versions, the sequence of branches is added to the indirect vector. 
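As a rough stand-alone sketch of the selection order that the new
spectre_bhb_enable_mitigation() follows (not part of the patch: the struct and helpers
here are invented stand-ins for the ID-register, MIDR-list and SMCCC probing, and the
"mitigations off" / "no point mitigating BHB alone" early-outs are omitted):

  #include <stdio.h>
  #include <stdbool.h>

  enum bhb_method { BHB_NONE, BHB_ECBHB, BHB_LOOP, BHB_FW };

  struct cpu_info {
      bool has_ecbhb;  /* ID_AA64MMFR1_EL1.ECBHB: exceptions clear branch history */
      int  loop_k;     /* 8/24/32 if the MIDR is on a "loop mitigated" list */
      bool fw_wa3;     /* firmware implements ARM_SMCCC_ARCH_WORKAROUND_3 */
  };

  /* Same priority order as the kernel-side code added below. */
  static enum bhb_method pick_mitigation(const struct cpu_info *cpu)
  {
      if (cpu->has_ecbhb)
          return BHB_ECBHB;  /* nothing needs adding to the vectors */
      if (cpu->loop_k)
          return BHB_LOOP;   /* branchy loop, K iterations, in the vectors */
      if (cpu->fw_wa3)
          return BHB_FW;     /* WA3 firmware call in the vectors */
      return BHB_NONE;       /* unaffected, or left vulnerable */
  }

  int main(void)
  {
      static const char * const names[] = { "none", "ecbhb", "loop", "firmware" };
      struct cpu_info a76 = { .loop_k = 24 };    /* e.g. Cortex-A76 */
      struct cpu_info a75 = { .fw_wa3 = true };  /* e.g. Cortex-A75 */

      printf("A76-like CPU: %s (K=%d)\n", names[pick_mitigation(&a76)], a76.loop_k);
      printf("A75-like CPU: %s\n", names[pick_mitigation(&a75)]);
      return 0;
  }

The K values and the firmware-mitigated CPUs in the example come from the MIDR lists
added below; the real code additionally installs the matching KVM template and the
per-CPU EL1 vector slot.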
Reviewed-by: Catalin Marinas Cc: # # # Signed-off-by: Greg Kroah-Hartman Conflicts: arch/arm64/Kconfig arch/arm64/include/asm/cpufeature.h arch/arm64/include/asm/cputype.h arch/arm64/kernel/cpu_errata.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/Kconfig | 9 + arch/arm64/include/asm/assembler.h | 4 +- arch/arm64/include/asm/cpufeature.h | 18 ++ arch/arm64/include/asm/cputype.h | 9 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/asm/vectors.h | 6 + arch/arm64/kernel/cpu_errata.c | 263 +++++++++++++++++++++++++++- arch/arm64/kvm/hyp/hyp-entry.S | 4 + 8 files changed, 311 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 02c2f528cb60..80ab9c9dd43c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1079,6 +1079,15 @@ config ARM64_SSBD If unsure, say Y. +config MITIGATE_SPECTRE_BRANCH_HISTORY + bool "Mitigate Spectre style attacks against branch history" if EXPERT + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. + When taking an exception from user-space, a sequence of branches + or a firmware call overwrites the branch history. + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" depends on AARCH32_EL0 diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 3fbe273e84ad..eeac442eb81a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -705,7 +705,9 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .macro __mitigate_spectre_bhb_loop tmp #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY - mov \tmp, #32 +alternative_cb spectre_bhb_patch_loop_iter + mov \tmp, #32 // Patched to correct the immediate +alternative_cb_end .Lspectre_bhb_loop\@: b . 
+ 4 subs \tmp, \tmp, #1 diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 0833197ea5c1..1a3f528487d2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -492,6 +492,21 @@ static inline bool cpu_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } +static inline bool supports_csv2p3(int scope) +{ + u64 pfr0; + u8 csv2_val; + + if (scope == SCOPE_LOCAL_CPU) + pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); + else + pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + csv2_val = cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV2_SHIFT); + return csv2_val == 3; +} + static inline bool system_supports_32bit_el0(void) { return cpus_have_const_cap(ARM64_HAS_32BIT_EL0); @@ -557,6 +572,9 @@ enum mitigation_state { }; enum mitigation_state arm64_get_spectre_bhb_state(void); +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +u8 spectre_bhb_loop_affected(int scope); +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index aa3879612130..852474568a9e 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -81,6 +81,10 @@ #define ARM_CPU_PART_CORTEX_A35 0xD04 #define ARM_CPU_PART_CORTEX_A55 0xD05 #define ARM_CPU_PART_CORTEX_A76 0xD0B +#define ARM_CPU_PART_NEOVERSE_V1 0xD40 +#define ARM_CPU_PART_CORTEX_A78 0xD41 +#define ARM_CPU_PART_CORTEX_X1 0xD44 +#define ARM_CPU_PART_CORTEX_A78C 0xD4B #define APM_CPU_PART_POTENZA 0x000 @@ -115,6 +119,11 @@ #define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35) #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55) #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) +#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) +#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) +#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) +#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) + #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index db16c4844de6..cc866be7569d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -586,6 +586,7 @@ #endif /* id_aa64mmfr1 */ +#define ID_AA64MMFR1_ECBHB_SHIFT 60 #define ID_AA64MMFR1_PAN_SHIFT 20 #define ID_AA64MMFR1_LOR_SHIFT 16 #define ID_AA64MMFR1_HPD_SHIFT 12 diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index 3f76dfd9e074..f222d8e033b3 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -9,6 +9,7 @@ #include #include +#include extern char vectors[]; extern char tramp_vectors[]; @@ -40,6 +41,11 @@ enum arm64_bp_harden_el1_vectors { EL1_VECTOR_KPTI, }; +#ifndef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +#define EL1_VECTOR_BHB_LOOP -1 +#define EL1_VECTOR_BHB_FW -1 +#endif /* !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + /* The vectors to use on return from EL0. e.g. 
to remap the kernel */ DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 82ad16b59366..37ddd14ff85f 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_HISILICON_ERRATUM_HIP08_RU_PREFETCH #include #include @@ -940,6 +941,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .matches = has_ssbd_mitigation, .midr_range_list = arm64_ssb_cpus, }, + { + .desc = "Spectre-BHB", + .capability = ARM64_SPECTRE_BHB, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = is_spectre_bhb_affected, + .cpu_enable = spectre_bhb_enable_mitigation, + }, #ifdef CONFIG_ARM64_ERRATUM_1463225 { .desc = "ARM erratum 1463225", @@ -1020,6 +1028,33 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, return sprintf(buf, "Vulnerable\n"); } +/* + * We try to ensure that the mitigation state can never change as the result of + * onlining a late CPU. + */ +static void update_mitigation_state(enum mitigation_state *oldp, + enum mitigation_state new) +{ + enum mitigation_state state; + + do { + state = READ_ONCE(*oldp); + if (new <= state) + break; + } while (cmpxchg_relaxed(oldp, state, new) != state); +} + +/* + * Spectre BHB. + * + * A CPU is either: + * - Mitigated by a branchy loop a CPU specific number of times, and listed + * in our "loop mitigated list". + * - Mitigated in software by the firmware Spectre v2 call. + * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no + * software mitigation in the vectors is needed. + * - Has CSV2.3, so is unaffected. + */ static enum mitigation_state spectre_bhb_state; enum mitigation_state arm64_get_spectre_bhb_state(void) @@ -1027,6 +1062,158 @@ enum mitigation_state arm64_get_spectre_bhb_state(void) return spectre_bhb_state; } +/* + * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any + * SCOPE_SYSTEM call will give the right answer. 
+ */ +u8 spectre_bhb_loop_affected(int scope) +{ + u8 k = 0; + static u8 max_bhb_k; + + if (scope == SCOPE_LOCAL_CPU) { + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + k = 8; + + max_bhb_k = max(max_bhb_k, k); + } else { + k = max_bhb_k; + } + + return k; +} + +static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + if (psci_ops.smccc_version == SMCCC_VERSION_1_0) + return SPECTRE_VULNERABLE; + + switch (psci_ops.conduit) { + case PSCI_CONDUIT_HVC: + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + break; + + case PSCI_CONDUIT_SMC: + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + break; + + default: + return SPECTRE_VULNERABLE; + } + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +static bool is_spectre_bhb_fw_affected(int scope) +{ + static bool system_affected; + enum mitigation_state fw_state; + bool has_smccc = (psci_ops.smccc_version >= SMCCC_VERSION_1_1); + static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + {}, + }; + bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), + spectre_bhb_firmware_mitigated_list); + + if (scope != SCOPE_LOCAL_CPU) + return system_affected; + + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { + system_affected = true; + return true; + } + + return false; +} + +static bool supports_ecbhb(int scope) +{ + u64 mmfr1; + + if (scope == SCOPE_LOCAL_CPU) + mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1); + else + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_ECBHB_SHIFT); +} + +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (supports_csv2p3(scope)) + return false; + + if (spectre_bhb_loop_affected(scope)) + return true; + + if (is_spectre_bhb_fw_affected(scope)) + return true; + + return false; +} + +static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) +{ + const char *v = arm64_get_bp_hardening_vector(slot); + + if (slot < 0) + return; + + __this_cpu_write(this_cpu_vector, v); + + /* + * When KPTI is in use, the vectors are switched when exiting to + * user-space. 
+ */ + if (arm64_kernel_unmapped_at_el0()) + return; + + write_sysreg(v, vbar_el1); + isb(); +} + #ifdef CONFIG_KVM_INDIRECT_VECTORS static const char *kvm_bhb_get_vecs_end(const char *start) { @@ -1042,7 +1229,7 @@ static const char *kvm_bhb_get_vecs_end(const char *start) return NULL; } -void kvm_setup_bhb_slot(const char *hyp_vecs_start) +static void kvm_setup_bhb_slot(const char *hyp_vecs_start) { int cpu, slot = -1; const char *hyp_vecs_end; @@ -1078,5 +1265,77 @@ void kvm_setup_bhb_slot(const char *hyp_vecs_start) #define __spectre_bhb_loop_k24_start NULL #define __spectre_bhb_loop_k32_start NULL -void kvm_setup_bhb_slot(const char *hyp_vecs_start) { }; +static void kvm_setup_bhb_slot(const char *hyp_vecs_start) { }; #endif + +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) +{ + enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + + if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) + return; + + if (!__spectrev2_safe && !__hardenbp_enab) { + /* No point mitigating Spectre-BHB alone. */ + } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { + pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); + } else if (cpu_mitigations_off()) { + pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + switch (spectre_bhb_loop_affected(SCOPE_SYSTEM)) { + case 8: + kvm_setup_bhb_slot(__spectre_bhb_loop_k8_start); + break; + case 24: + kvm_setup_bhb_slot(__spectre_bhb_loop_k24_start); + break; + case 32: + kvm_setup_bhb_slot(__spectre_bhb_loop_k32_start); + break; + default: + WARN_ON_ONCE(1); + } + this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); + + state = SPECTRE_MITIGATED; + } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (fw_state == SPECTRE_MITIGATED) { + kvm_setup_bhb_slot(__smccc_workaround_3_smc_start); + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * With WA3 in the vectors, the WA1 calls can be + * removed. 
+ */ + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + } + } + + update_mitigation_state(&spectre_bhb_state, state); +} + +/* Patched to correct the immediate */ +void __init spectre_bhb_patch_loop_iter(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); +} diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index acdf1e51e51c..18f454d772f8 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -111,6 +111,10 @@ el1_hvc_guest: /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_1 ^ \ ARM_SMCCC_ARCH_WORKAROUND_2) + cbz w1, wa_epilogue + + eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_2 ^ \ + ARM_SMCCC_ARCH_WORKAROUND_3) cbnz w1, el1_trap #ifdef CONFIG_ARM64_SSBD -- Gitee From 8fa0eebcf9d26f0ffdb802cef0b071a71fb41473 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:47 +0000 Subject: [PATCH 255/340] KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated stable inclusion from stable-v4.19.236 commit 5f051d32b03f08a0507ac1afd7b9c0a30c8e5d59 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit a5905d6af492ee6a4a2205f0d550b3f931b03d03 upstream. KVM allows the guest to discover whether the ARCH_WORKAROUND SMCCC are implemented, and to preserve that state during migration through its firmware register interface. Add the necessary boiler plate for SMCCC_ARCH_WORKAROUND_3. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas [ kvm code moved to virt/kvm/arm, removed fw regs ABI. 
Added 32bit stub ] Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Conflicts: arch/arm/include/asm/kvm_host.h arch/arm64/include/asm/kvm_host.h virt/kvm/arm/psci.c Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/include/asm/kvm_host.h | 7 +++++++ arch/arm64/include/asm/kvm_host.h | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0066de61f4c6..46a2e8636f86 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -405,4 +406,10 @@ static inline int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) return 0; } +static inline int kvm_arm_get_spectre_bhb_state(void) +{ + /* 32bit guests don't need firmware for this */ + return SPECTRE_VULNERABLE; /* aka SMCCC_RET_NOT_SUPPORTED */ +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 556351524748..bf03056e3751 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -625,4 +625,9 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_config_vm(struct kvm *kvm, unsigned long type); +static inline enum mitigation_state kvm_arm_get_spectre_bhb_state(void) +{ + return arm64_get_spectre_bhb_state(); +} + #endif /* __ARM64_KVM_HOST_H__ */ -- Gitee From 09501bea5e1c893c432b20ab318b3c4427054dcd Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Wed, 20 Apr 2022 11:53:48 +0000 Subject: [PATCH 256/340] arm64: add ID_AA64ISAR2_EL1 sys register stable inclusion from stable-v4.19.236 commit a44e7ddb5822b943cd50c5ad6a2541fb445d58bd category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 9e45365f1469ef2b934f9d035975dbc9ad352116 upstream. This is a new ID register, introduced in 8.7. 
Signed-off-by: Joey Gouly Cc: Will Deacon Cc: Marc Zyngier Cc: James Morse Cc: Alexandru Elisei Cc: Suzuki K Poulose Cc: Reiji Watanabe Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20211210165432.8106-3-joey.gouly@arm.com Signed-off-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kernel/cpufeature.c | 9 +++++++++ arch/arm64/kernel/cpuinfo.c | 1 + arch/arm64/kvm/sys_regs.c | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 88392272250e..3a9908a01219 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -36,6 +36,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64dfr1; u64 reg_id_aa64isar0; u64 reg_id_aa64isar1; + u64 reg_id_aa64isar2; u64 reg_id_aa64mmfr0; u64 reg_id_aa64mmfr1; u64 reg_id_aa64mmfr2; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index cc866be7569d..cc0e11463b0b 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -159,6 +159,7 @@ #define SYS_ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0) #define SYS_ID_AA64ISAR1_EL1 sys_reg(3, 0, 0, 6, 1) +#define SYS_ID_AA64ISAR2_EL1 sys_reg(3, 0, 0, 6, 2) #define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0) #define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f25f0dfed813..530c049770b2 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -153,6 +153,10 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), @@ -402,6 +406,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 6 */ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), ARM64_FTR_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1), + ARM64_FTR_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), @@ -550,6 +555,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1); init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); + init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); @@ -667,6 +673,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar0, boot->reg_id_aa64isar0); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, + info->reg_id_aa64isar2, boot->reg_id_aa64isar2); /* * Differing PARange support is fine as long as all peripherals and @@ -816,6 +824,7 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64MMFR2_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); 
read_sysreg_case(SYS_ID_AA64ISAR1_EL1); + read_sysreg_case(SYS_ID_AA64ISAR2_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index b79589ab3070..005d88db1082 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -342,6 +342,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); + info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 29496548c56b..25caa9ec95e4 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1325,7 +1325,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=6 */ ID_SANITISED(ID_AA64ISAR0_EL1), ID_SANITISED(ID_AA64ISAR1_EL1), - ID_UNALLOCATED(6,2), + ID_SANITISED(ID_AA64ISAR2_EL1), ID_UNALLOCATED(6,3), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), -- Gitee From d275c04a841934b8305bbf13931deb07b1bc76f6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Apr 2022 11:53:49 +0000 Subject: [PATCH 257/340] arm64: Use the clearbhb instruction in mitigations stable inclusion from stable-v4.19.236 commit ed5dec3fae86f20db52930e1d9a7cc38403994cc category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 228a26b912287934789023b4132ba76065d9491c upstream. Future CPUs may implement a clearbhb instruction that is sufficient to mitigate SpectreBHB. CPUs that implement this instruction, but not CSV2.3 must be affected by Spectre-BHB. Add support to use this instruction as the BHB mitigation on CPUs that support it. The instruction is in the hint space, so it will be treated by a NOP as older CPUs. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas [ modified for stable: Use a KVM vector template instead of alternatives ] Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jiahao Reviewed-by: Hanjun Guo Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm64/include/asm/assembler.h | 7 +++++++ arch/arm64/include/asm/cpufeature.h | 13 +++++++++++++ arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/include/asm/vectors.h | 7 +++++++ arch/arm64/kernel/cpu_errata.c | 14 ++++++++++++++ arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 8 ++++++++ arch/arm64/kvm/hyp/hyp-entry.S | 6 ++++++ 8 files changed, 59 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index eeac442eb81a..214319505313 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -118,6 +118,13 @@ hint #20 .endm +/* + * Clear Branch History instruction + */ + .macro clearbhb + hint #22 + .endm + /* * Sanitise a 64-bit bounded index wrt speculation, returning zero if out * of bounds. 
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 1a3f528487d2..bfb699957880 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -507,6 +507,19 @@ static inline bool supports_csv2p3(int scope) return csv2_val == 3; } +static inline bool supports_clearbhb(int scope) +{ + u64 isar2; + + if (scope == SCOPE_LOCAL_CPU) + isar2 = read_sysreg_s(SYS_ID_AA64ISAR2_EL1); + else + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + return cpuid_feature_extract_unsigned_field(isar2, + ID_AA64ISAR2_CLEARBHB_SHIFT); +} + static inline bool system_supports_32bit_el0(void) { return cpus_have_const_cap(ARM64_HAS_32BIT_EL0); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index cc0e11463b0b..0fd51d253648 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -528,6 +528,9 @@ #define ID_AA64ISAR1_JSCVT_SHIFT 12 #define ID_AA64ISAR1_DPB_SHIFT 0 +/* id_aa64isar2 */ +#define ID_AA64ISAR2_CLEARBHB_SHIFT 28 + /* id_aa64pfr0 */ #define ID_AA64PFR0_CSV3_SHIFT 60 #define ID_AA64PFR0_CSV2_SHIFT 56 diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index f222d8e033b3..695583b9a145 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -33,6 +33,12 @@ enum arm64_bp_harden_el1_vectors { * canonical vectors. */ EL1_VECTOR_BHB_FW, + + /* + * Use the ClearBHB instruction, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_CLEAR_INSN, #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ /* @@ -44,6 +50,7 @@ enum arm64_bp_harden_el1_vectors { #ifndef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY #define EL1_VECTOR_BHB_LOOP -1 #define EL1_VECTOR_BHB_FW -1 +#define EL1_VECTOR_BHB_CLEAR_INSN -1 #endif /* !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ /* The vectors to use on return from EL0. e.g. to remap the kernel */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 37ddd14ff85f..fdff9c6d8969 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -156,6 +156,8 @@ extern char __spectre_bhb_loop_k24_start[]; extern char __spectre_bhb_loop_k24_end[]; extern char __spectre_bhb_loop_k32_start[]; extern char __spectre_bhb_loop_k32_end[]; +extern char __spectre_bhb_clearbhb_start[]; +extern char __spectre_bhb_clearbhb_end[]; static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, const char *hyp_vecs_end) @@ -1051,6 +1053,7 @@ static void update_mitigation_state(enum mitigation_state *oldp, * - Mitigated by a branchy loop a CPU specific number of times, and listed * in our "loop mitigated list". * - Mitigated in software by the firmware Spectre v2 call. + * - Has the ClearBHB instruction to perform the mitigation. * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no * software mitigation in the vectors is needed. * - Has CSV2.3, so is unaffected. 
@@ -1185,6 +1188,9 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; + if (supports_clearbhb(scope)) + return true; + if (spectre_bhb_loop_affected(scope)) return true; @@ -1225,6 +1231,8 @@ static const char *kvm_bhb_get_vecs_end(const char *start) return __spectre_bhb_loop_k24_end; else if (start == __spectre_bhb_loop_k32_start) return __spectre_bhb_loop_k32_end; + else if (start == __spectre_bhb_clearbhb_start) + return __spectre_bhb_clearbhb_end; return NULL; } @@ -1264,6 +1272,7 @@ static void kvm_setup_bhb_slot(const char *hyp_vecs_start) #define __spectre_bhb_loop_k8_start NULL #define __spectre_bhb_loop_k24_start NULL #define __spectre_bhb_loop_k32_start NULL +#define __spectre_bhb_clearbhb_start NULL static void kvm_setup_bhb_slot(const char *hyp_vecs_start) { }; #endif @@ -1282,6 +1291,11 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) } else if (cpu_mitigations_off()) { pr_info_once("spectre-bhb mitigation disabled by command line option\n"); } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) { + kvm_setup_bhb_slot(__spectre_bhb_clearbhb_start); + this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); + state = SPECTRE_MITIGATED; } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { switch (spectre_bhb_loop_affected(SCOPE_SYSTEM)) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 530c049770b2..aa9178e9acc6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -154,6 +154,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_CLEARBHB_SHIFT, 4, 0), ARM64_FTR_END, }; diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 65f66020985a..7c231eb211a4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1081,6 +1081,7 @@ alternative_else_nop_endif #define BHB_MITIGATION_NONE 0 #define BHB_MITIGATION_LOOP 1 #define BHB_MITIGATION_FW 2 +#define BHB_MITIGATION_INSN 3 .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 @@ -1097,6 +1098,11 @@ alternative_else_nop_endif __mitigate_spectre_bhb_loop x30 .endif // \bhb == BHB_MITIGATION_LOOP + .if \bhb == BHB_MITIGATION_INSN + clearbhb + isb + .endif // \bhb == BHB_MITIGATION_INSN + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy @@ -1173,6 +1179,7 @@ ENTRY(tramp_vectors) #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_INSN #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE END(tramp_vectors) @@ -1235,6 +1242,7 @@ ENTRY(__bp_harden_el1_vectors) #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY generate_el1_vector bhb=BHB_MITIGATION_LOOP generate_el1_vector bhb=BHB_MITIGATION_FW + generate_el1_vector bhb=BHB_MITIGATION_INSN #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ END(__bp_harden_el1_vectors) .popsection diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 18f454d772f8..454012f16c00 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -387,4 +387,10 @@ ENTRY(__spectre_bhb_loop_k32_start) ldp x0, x1, [sp, #(8 * 0)] add sp, 
sp, #(8 * 2) ENTRY(__spectre_bhb_loop_k32_end) + +ENTRY(__spectre_bhb_clearbhb_start) + esb + clearbhb + isb +ENTRY(__spectre_bhb_clearbhb_end) #endif -- Gitee From b453c348528392ae14f37d2b35ee63bea5cb9b0b Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Thu, 21 Apr 2022 02:40:42 +0000 Subject: [PATCH 258/340] af_key: add __GFP_ZERO flag for compose_sadb_supported in function pfkey_register stable inclusion from stable-v4.19.238 commit 693fe8af9a2625139de07bd1ae212a7d89c37795 category: bugfix bugzilla: 186606, https://gitee.com/src-openeuler/kernel/issues/I53SSV CVE: CVE-2022-1353 -------------------------------- [ Upstream commit 9a564bccb78a76740ea9d75a259942df8143d02c ] Add __GFP_ZERO flag for compose_sadb_supported in function pfkey_register to initialize the buffer of supp_skb to fix a kernel-info-leak issue. 1) Function pfkey_register calls compose_sadb_supported to request a sk_buff. 2) compose_sadb_supported calls alloc_sbk to allocate a sk_buff, but it doesn't zero it. 3) If auth_len is greater 0, then compose_sadb_supported treats the memory as a struct sadb_supported and begins to initialize. But it just initializes the field sadb_supported_len and field sadb_supported_exttype without field sadb_supported_reserved. Reported-by: TCS Robot Signed-off-by: Haimin Zhang Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Xu Jia Reviewed-by: Wei Yongjun Reviewed-by: Wang Weiyang Signed-off-by: Yongqiang Liu --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index c7d5a6015389..8ce513e08fd7 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1709,7 +1709,7 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad xfrm_probe_algs(); - supp_skb = compose_sadb_supported(hdr, GFP_KERNEL); + supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<sadb_msg_satype); -- Gitee From 7348b7ecb179fb5e2008a02932f7be4ba9117a03 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 22 Apr 2022 08:01:44 +0000 Subject: [PATCH 259/340] x86/entry/64: Fix unwind hints in kernel exit path stable inclusion from stable-v4.19.238 commit ac069a960d357a7495fd2a1cde28d9b03250cace category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I53TS3 CVE: NA -------------------------------- commit 1fb143634a38095b641a3a21220774799772dc4c upstream. In swapgs_restore_regs_and_return_to_usermode, after the stack is switched to the trampoline stack, the existing UNWIND_HINT_REGS hint is no longer valid, which can result in the following ORC unwinder warning: WARNING: can't dereference registers at 000000003aeb0cdd for ip swapgs_restore_regs_and_return_to_usermode+0x93/0xa0 For full correctness, we could try to add complicated unwind hints so the unwinder could continue to find the registers, but when when it's this close to kernel exit, unwind hints aren't really needed anymore and it's fine to just use an empty hint which tells the unwinder to stop. For consistency, also move the UNWIND_HINT_EMPTY in entry_SYSCALL_64_after_hwframe to a similar location. Fixes: 3e3b9293d392 ("x86/entry/64: Return to userspace from the trampoline stack") Reported-by: Vince Weaver Reported-by: Dave Jones Reported-by: Dr. 
David Alan Gilbert Reported-by: Joe Mario Reported-by: Jann Horn Reported-by: Linus Torvalds Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/60ea8f562987ed2d9ace2977502fe481c0d7c9a0.1587808742.git.jpoimboe@redhat.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zou Yipeng Reviewed-by: Zhang Jianhua Signed-off-by: Yongqiang Liu --- arch/x86/entry/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ccb5e3486aee..ca651113bea2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -312,7 +312,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - UNWIND_HINT_EMPTY POP_REGS pop_rdi=0 skip_r11rcx=1 /* @@ -321,6 +320,7 @@ syscall_return_via_sysret: */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -700,6 +700,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ -- Gitee From 5b5dc732dc8a8b7a4ee08f6f6ab17fdcba624599 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 22 Apr 2022 08:01:45 +0000 Subject: [PATCH 260/340] objtool: Fix stack offset tracking for indirect CFAs stable inclusion from stable-v4.19.238 commit 9fbfc77d0f5e04a7434fa6d32112036e2a41bc6e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I53TS3 CVE: NA -------------------------------- commit d8dd25a461e4eec7190cb9d66616aceacc5110ad upstream. When the current frame address (CFA) is stored on the stack (i.e., cfa->base == CFI_SP_INDIRECT), objtool neglects to adjust the stack offset when there are subsequent pushes or pops. This results in bad ORC data at the end of the ENTER_IRQ_STACK macro, when it puts the previous stack pointer on the stack and does a subsequent push. 
This fixes the following unwinder warning: WARNING: can't dereference registers at 00000000f0a6bdba for ip interrupt_entry+0x9f/0xa0 Fixes: 627fce14809b ("objtool: Add ORC unwind table generation") Reported-by: Vince Weaver Reported-by: Dave Jones Reported-by: Steven Rostedt Reported-by: Vegard Nossum Reported-by: Joe Mario Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/853d5d691b29e250333332f09b8e27410b2d9924.1587808742.git.jpoimboe@redhat.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zou Yipeng Reviewed-by: Zhang Jianhua Signed-off-by: Yongqiang Liu --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ecf5fc77f50b..952273d7afd9 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1318,7 +1318,7 @@ static int update_insn_state_regs(struct instruction *insn, struct insn_state *s struct cfi_reg *cfa = &state->cfa; struct stack_op *op = &insn->stack_op; - if (cfa->base != CFI_SP) + if (cfa->base != CFI_SP && cfa->base != CFI_SP_INDIRECT) return 0; /* push */ -- Gitee From 5de027286b589c83daad07aed84c912d551606c5 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Fri, 22 Apr 2022 13:01:21 +0000 Subject: [PATCH 261/340] ax25: NPD bug when detaching AX25 device stable inclusion from linux-4.19.235 commit bd05a8f1b7368ef4f7845548312fc61ab4fa63ce category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1199 -------------------------------- commit 1ade48d0c27d5da1ccf4b583d8c5fc8b534a3ac8 upstream. The existing cleanup routine implementation is not well synchronized with the syscall routine. When a device is detaching, below race could occur. static int ax25_sendmsg(...) { ... lock_sock() ax25 = sk_to_ax25(sk); if (ax25->ax25_dev == NULL) // CHECK ... ax25_queue_xmit(skb, ax25->ax25_dev->dev); // USE ... } static void ax25_kill_by_device(...) { ... if (s->ax25_dev == ax25_dev) { s->ax25_dev = NULL; ... } Other syscall functions like ax25_getsockopt, ax25_getname, ax25_info_show also suffer from similar races. To fix them, this patch introduce lock_sock() into ax25_kill_by_device in order to guarantee that the nullify action in cleanup routine cannot proceed when another socket request is pending. Signed-off-by: Hanjie Wu Signed-off-by: Lin Ma Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zhengchao Shao Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 44ec492f3dc2..5d05d8dca168 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -88,8 +88,10 @@ static void ax25_kill_by_device(struct net_device *dev) again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { - s->ax25_dev = NULL; spin_unlock_bh(&ax25_list_lock); + lock_sock(s->sk); + s->ax25_dev = NULL; + release_sock(s->sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); -- Gitee From afbc39a55021ee84bb1dec7c07fc98a36d58e9f7 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 22 Apr 2022 13:01:22 +0000 Subject: [PATCH 262/340] ax25: improve the incomplete fix to avoid UAF and NPD bugs stable inclusion from linux-4.19.235 commit 3072e72814de56f3c674650a8af98233ddf78b19 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1199 -------------------------------- [ Upstream commit 4e0f718daf97d47cf7dec122da1be970f145c809 ] The previous commit 1ade48d0c27d ("ax25: NPD bug when detaching AX25 device") introduce lock_sock() into ax25_kill_by_device to prevent NPD bug. But the concurrency NPD or UAF bug will occur, when lock_sock() or release_sock() dereferences the ax25_cb->sock. The NULL pointer dereference bug can be shown as below: ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() | ax25_cb_del() ... | ... | ax25->sk=NULL; lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... | The root cause is that the sock is set to null before dereference site (1) or (2). Therefore, this patch extracts the ax25_cb->sock in advance, and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() and ensure the value of sock is not null before dereference sites. The concurrency UAF bug can be shown as below: ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() ... | ... | sock_put(sk); //FREE lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... | The root cause is that the sock is released before dereference site (1) or (2). Therefore, this patch uses sock_hold() to increase the refcount of sock and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() in ax25_destroy_socket() and ensure the sock wil not be released before dereference sites. Signed-off-by: Duoming Zhou Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zhengchao Shao Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 5d05d8dca168..dccfd9c2086f 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -80,6 +80,7 @@ static void ax25_kill_by_device(struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *s; + struct sock *sk; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; @@ -88,13 +89,15 @@ static void ax25_kill_by_device(struct net_device *dev) again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { + sk = s->sk; + sock_hold(sk); spin_unlock_bh(&ax25_list_lock); - lock_sock(s->sk); + lock_sock(sk); s->ax25_dev = NULL; - release_sock(s->sk); + release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); - + sock_put(sk); /* The entry could have been deleted from the * list meanwhile and thus the next pointer is * no longer valid. Play it safe and restart -- Gitee From 1cb76aa23d195bf9802e8d01891abd16a20c5cf1 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 22 Apr 2022 13:01:23 +0000 Subject: [PATCH 263/340] ax25: Fix NULL pointer dereference in ax25_kill_by_device stable inclusion from linux-4.19.235 commit 5ab8de9377edde3eaf1de9872e2f01d43157cd6c category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1199 -------------------------------- [ Upstream commit 71171ac8eb34ce7fe6b3267dce27c313ab3cb3ac ] When two ax25 devices attempted to establish connection, the requester use ax25_create(), ax25_bind() and ax25_connect() to initiate connection. The receiver use ax25_rcv() to accept connection and use ax25_create_cb() in ax25_rcv() to create ax25_cb, but the ax25_cb->sk is NULL. When the receiver is detaching, a NULL pointer dereference bug caused by sock_hold(sk) in ax25_kill_by_device() will happen. The corresponding fail log is shown below: =============================================================== BUG: KASAN: null-ptr-deref in ax25_device_event+0xfd/0x290 Call Trace: ... ax25_device_event+0xfd/0x290 raw_notifier_call_chain+0x5e/0x70 dev_close_many+0x174/0x220 unregister_netdevice_many+0x1f7/0xa60 unregister_netdevice_queue+0x12f/0x170 unregister_netdev+0x13/0x20 mkiss_close+0xcd/0x140 tty_ldisc_release+0xc0/0x220 tty_release_struct+0x17/0xa0 tty_release+0x62d/0x670 ... This patch add condition check in ax25_kill_by_device(). If s->sk is NULL, it will goto if branch to kill device. Fixes: 4e0f718daf97 ("ax25: improve the incomplete fix to avoid UAF and NPD bugs") Reported-by: Thomas Osterried Signed-off-by: Duoming Zhou Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zhengchao Shao Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index dccfd9c2086f..37a333d0ee03 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -90,6 +90,13 @@ static void ax25_kill_by_device(struct net_device *dev) ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { sk = s->sk; + if (!sk) { + spin_unlock_bh(&ax25_list_lock); + s->ax25_dev = NULL; + ax25_disconnect(s, ENETUNREACH); + spin_lock_bh(&ax25_list_lock); + goto again; + } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk); -- Gitee From b65eebc6e33ad2a8ee77a59b8931c407d41be9e2 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 22 Apr 2022 13:01:24 +0000 Subject: [PATCH 264/340] ax25: fix NPD bug in ax25_disconnect mainline inclusion from mainline-v5.17-rc4 commit 7ec02f5ac8a5be5a3f20611731243dc5e1d9ba10 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1199 -------------------------------- The ax25_disconnect() in ax25_kill_by_device() is not protected by any locks, thus there is a race condition between ax25_disconnect() and ax25_destroy_socket(). when ax25->sk is assigned as NULL by ax25_destroy_socket(), a NULL pointer dereference bug will occur if site (1) or (2) dereferences ax25->sk. ax25_kill_by_device() | ax25_release() ax25_disconnect() | ax25_destroy_socket() ... | if(ax25->sk != NULL) | ... ... | ax25->sk = NULL; bh_lock_sock(ax25->sk); //(1) | ... ... | bh_unlock_sock(ax25->sk); //(2)| This patch moves ax25_disconnect() into lock_sock(), which can synchronize with ax25_destroy_socket() in ax25_release(). Fail log: =============================================================== BUG: kernel NULL pointer dereference, address: 0000000000000088 ... RIP: 0010:_raw_spin_lock+0x7e/0xd0 ... Call Trace: ax25_disconnect+0xf6/0x220 ax25_device_event+0x187/0x250 raw_notifier_call_chain+0x5e/0x70 dev_close_many+0x17d/0x230 rollback_registered_many+0x1f1/0x950 unregister_netdevice_queue+0x133/0x200 unregister_netdev+0x13/0x20 ... Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller Conflict: net/ax25/af_ax25.c Signed-off-by: Zhengchao Shao Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 37a333d0ee03..3a5812c1a098 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -101,8 +101,8 @@ static void ax25_kill_by_device(struct net_device *dev) spin_unlock_bh(&ax25_list_lock); lock_sock(sk); s->ax25_dev = NULL; - release_sock(sk); ax25_disconnect(s, ENETUNREACH); + release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); /* The entry could have been deleted from the -- Gitee From 98802b106005c318a9a364f943ca6bd1b1ef07ed Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 22 Apr 2022 13:01:25 +0000 Subject: [PATCH 265/340] ax25: Fix NULL pointer dereferences in ax25 timers mainline inclusion from mainline-v5.18-rc1 commit fc6d01ff9ef03b66d4a3a23b46fc3c3d8cf92009 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1205 -------------------------------- The previous commit 7ec02f5ac8a5 ("ax25: fix NPD bug in ax25_disconnect") move ax25_disconnect into lock_sock() in order to prevent NPD bugs. 
But there are race conditions that may lead to null pointer dereferences in ax25_heartbeat_expiry(), ax25_t1timer_expiry(), ax25_t2timer_expiry(), ax25_t3timer_expiry() and ax25_idletimer_expiry(), when we use ax25_kill_by_device() to detach the ax25 device. One of the race conditions that cause null pointer dereferences can be shown as below: (Thread 1) | (Thread 2) ax25_connect() | ax25_std_establish_data_link() | ax25_start_t1timer() | mod_timer(&ax25->t1timer,..) | | ax25_kill_by_device() (wait a time) | ... | s->ax25_dev = NULL; //(1) ax25_t1timer_expiry() | ax25->ax25_dev->values[..] //(2)| ... ... | We set null to ax25_cb->ax25_dev in position (1) and dereference the null pointer in position (2). The corresponding fail log is shown below: =============================================================== BUG: kernel NULL pointer dereference, address: 0000000000000050 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.17.0-rc6-00794-g45690b7d0 RIP: 0010:ax25_t1timer_expiry+0x12/0x40 ... Call Trace: call_timer_fn+0x21/0x120 __run_timers.part.0+0x1ca/0x250 run_timer_softirq+0x2c/0x60 __do_softirq+0xef/0x2f3 irq_exit_rcu+0xb6/0x100 sysvec_apic_timer_interrupt+0xa2/0xd0 ... This patch moves ax25_disconnect() before s->ax25_dev = NULL and uses del_timer_sync() to delete timers in ax25_disconnect(). If ax25_disconnect() is called by ax25_kill_by_device() or ax25->ax25_dev is NULL, the reason in ax25_disconnect() will be equal to ENETUNREACH, it will wait all timers to stop before we set null to s->ax25_dev in ax25_kill_by_device(). Fixes: 7ec02f5ac8a5 ("ax25: fix NPD bug in ax25_disconnect") Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller Conflict: net/ax25/af_ax25.c Signed-off-by: Zhengchao Shao Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 4 ++-- net/ax25/ax25_subr.c | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 3a5812c1a098..2b0bee2dbbb7 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -92,16 +92,16 @@ static void ax25_kill_by_device(struct net_device *dev) sk = s->sk; if (!sk) { spin_unlock_bh(&ax25_list_lock); - s->ax25_dev = NULL; ax25_disconnect(s, ENETUNREACH); + s->ax25_dev = NULL; spin_lock_bh(&ax25_list_lock); goto again; } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk); - s->ax25_dev = NULL; ax25_disconnect(s, ENETUNREACH); + s->ax25_dev = NULL; release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 038b109b2be7..c129865cad9f 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -264,12 +264,20 @@ void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25); - if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) - ax25_stop_heartbeat(ax25); - ax25_stop_t1timer(ax25); - ax25_stop_t2timer(ax25); - ax25_stop_t3timer(ax25); - ax25_stop_idletimer(ax25); + if (reason == ENETUNREACH) { + del_timer_sync(&ax25->timer); + del_timer_sync(&ax25->t1timer); + del_timer_sync(&ax25->t2timer); + del_timer_sync(&ax25->t3timer); + del_timer_sync(&ax25->idletimer); + } else { + if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) + ax25_stop_heartbeat(ax25); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + } ax25->state = AX25_STATE_0; -- Gitee From 0eba1f6eb46b33dc5a220cd812b4f951f1ca4360 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Sun, 24 Apr 2022 02:21:32 +0000 
Subject: [PATCH 266/340] Revert "perf: Paper over the hw.target problems" hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I53VHE CVE: NA -------------------------------- This reverts commit 0380474221530db9147a001034794a95fb4c46c1. This patch was used to solve a race between close() and fork() in perf. However, this patch was not accepted by the community. As a result, the destroy interface is incorrectly invoked during perf_remove_from_context(), causing a UAF, see https://lkml.org/lkml/2019/6/28/856. For the 4.19 kernel, the final fix patch has been incorporated, see eb41044bbece4. Therefore, we need to revert the patch. Signed-off-by: Yang Jihong Reviewed-by: Kuohai Xu Signed-off-by: Yongqiang Liu --- kernel/events/core.c | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index deba52307349..8dc07a529e6d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2135,28 +2135,6 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); - /* - * This is as passable as any hw.target handling out there; - * hw.target implies task context, therefore, no migration. - * Which together with DETACH_GROUP means that this is the - * final remove_from_context of a task event. - */ - if (event->hw.target && (flags & DETACH_GROUP)) { - /* - * Now, the problem with, say uprobes, is that they - * use hw.target for context in their ->destroy() - * callbacks. Supposedly, they may need to poke at - * its contents, so better call it while we still - * have the task. - */ - if (event->destroy) { - event->destroy(event); - event->destroy = NULL; - } - put_task_struct(event->hw.target); - event->hw.target = NULL; - } - /* * The above event_function_call() can NO-OP when it hits * TASK_TOMBSTONE. In that case we must already have been detached -- Gitee From b1ee3b19cbd9901aa3965f5eb85a52fe2cb59cf1 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Tue, 26 Apr 2022 11:31:55 +0000 Subject: [PATCH 267/340] can: usb_8dev: usb_8dev_start_xmit(): fix double dev_kfree_skb() in error path mainline inclusion from mainline-v5.18-rc1 commit 3d3925ff6433f98992685a9679613a2cc97f3ce2 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I51YBQ CVE: CVE-2022-28388 ------------------------------------------------- There is no need to call dev_kfree_skb() when usb_submit_urb() fails because can_put_echo_skb() deletes the original skb and can_free_echo_skb() deletes the cloned skb.
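A minimal sketch of the ownership rule this relies on (illustrative only, simplified from the driver's TX path; URB setup and the busy-queue handling are omitted):

	can_put_echo_skb(skb, netdev, context->echo_index); /* skb is consumed here */
	err = usb_submit_urb(urb, GFP_ATOMIC);
	if (unlikely(err)) {
		/* undo only the echo slot; the original skb is no longer ours */
		can_free_echo_skb(netdev, context->echo_index);
		/* a dev_kfree_skb(skb) here would free the same buffer twice */
	}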
Fixes: 0024d8ad1639 ("can: usb_8dev: Add support for USB2CAN interface from 8 devices") Link: https://lore.kernel.org/all/20220311080614.45229-1-hbh25y@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Hangyu Hua Signed-off-by: Marc Kleine-Budde Conflicts: drivers/net/can/usb/usb_8dev.c Signed-off-by: Ziyang Xuan Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/net/can/usb/usb_8dev.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 3e4416473607..9cd67262a5ed 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -680,9 +680,20 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, atomic_inc(&priv->active_tx_urbs); err = usb_submit_urb(urb, GFP_ATOMIC); - if (unlikely(err)) - goto failed; - else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) + if (unlikely(err)) { + can_free_echo_skb(netdev, context->echo_index); + + usb_unanchor_urb(urb); + usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); + + atomic_dec(&priv->active_tx_urbs); + + if (err == -ENODEV) + netif_device_detach(netdev); + else + netdev_warn(netdev, "failed tx_urb %d\n", err); + stats->tx_dropped++; + } else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) /* Slow down tx path */ netif_stop_queue(netdev); @@ -701,19 +712,6 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; -failed: - can_free_echo_skb(netdev, context->echo_index); - - usb_unanchor_urb(urb); - usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); - - atomic_dec(&priv->active_tx_urbs); - - if (err == -ENODEV) - netif_device_detach(netdev); - else - netdev_warn(netdev, "failed tx_urb %d\n", err); - nomembuf: usb_free_urb(urb); -- Gitee From 1f3f6c4947da96df9b723821a82634cdc9100d18 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:12:54 +0000 Subject: [PATCH 268/340] ARM: report Spectre v2 status through sysfs stable inclusion from stable-v4.19.234 commit dc64af755099d1e51fd64e99fe3a59b75595814a category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 9dd78194a3722fa6712192cdd4f7032d45112a9a upstream. As per other architectures, add support for reporting the Spectre vulnerability status via sysfs CPU. 
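A hedged usage note (not part of the patch; the exact string depends on the CPU and on which workaround gets installed): with CPU_SPECTRE now selecting GENERIC_CPU_VULNERABILITIES, the state is expected to be readable as, for example:

	# cat /sys/devices/system/cpu/vulnerabilities/spectre_v2
	Mitigation: Branch predictor hardening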
Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) [ preserve res variable and add SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED - gregkh ] Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/include/asm/spectre.h | 28 +++++++ arch/arm/kernel/Makefile | 2 + arch/arm/kernel/spectre.c | 54 ++++++++++++++ arch/arm/mm/Kconfig | 1 + arch/arm/mm/proc-v7-bugs.c | 132 +++++++++++++++++++++++++-------- 5 files changed, 186 insertions(+), 31 deletions(-) create mode 100644 arch/arm/include/asm/spectre.h create mode 100644 arch/arm/kernel/spectre.c diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h new file mode 100644 index 000000000000..8a9019e08dba --- /dev/null +++ b/arch/arm/include/asm/spectre.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_SPECTRE_H +#define __ASM_SPECTRE_H + +enum { + SPECTRE_UNAFFECTED, + SPECTRE_MITIGATED, + SPECTRE_VULNERABLE, +}; + +enum { + __SPECTRE_V2_METHOD_BPIALL, + __SPECTRE_V2_METHOD_ICIALLU, + __SPECTRE_V2_METHOD_SMC, + __SPECTRE_V2_METHOD_HVC, +}; + +enum { + SPECTRE_V2_METHOD_BPIALL = BIT(__SPECTRE_V2_METHOD_BPIALL), + SPECTRE_V2_METHOD_ICIALLU = BIT(__SPECTRE_V2_METHOD_ICIALLU), + SPECTRE_V2_METHOD_SMC = BIT(__SPECTRE_V2_METHOD_SMC), + SPECTRE_V2_METHOD_HVC = BIT(__SPECTRE_V2_METHOD_HVC), +}; + +void spectre_v2_update_state(unsigned int state, unsigned int methods); + +#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 8cad59465af3..70fd896c132a 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -102,4 +102,6 @@ endif obj-$(CONFIG_HAVE_ARM_SMCCC) += smccc-call.o +obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += spectre.o + extra-y := $(head-y) vmlinux.lds diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c new file mode 100644 index 000000000000..6f6dd1cfd099 --- /dev/null +++ b/arch/arm/kernel/spectre.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +static unsigned int spectre_v2_state; +static unsigned int spectre_v2_methods; + +void spectre_v2_update_state(unsigned int state, unsigned int method) +{ + if (state > spectre_v2_state) + spectre_v2_state = state; + spectre_v2_methods |= method; +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + const char *method; + + if (spectre_v2_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "%s\n", "Not affected"); + + if (spectre_v2_state != SPECTRE_MITIGATED) + return sprintf(buf, "%s\n", "Vulnerable"); + + switch (spectre_v2_methods) { + case SPECTRE_V2_METHOD_BPIALL: + method = "Branch predictor hardening"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + method = "I-cache invalidation"; + break; + + case SPECTRE_V2_METHOD_SMC: + case SPECTRE_V2_METHOD_HVC: + method = "Firmware call"; + break; + + default: + method = "Multiple mitigations"; + break; + } + + return sprintf(buf, "Mitigation: %s\n", method); +} diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index b169e580bf82..2069a1b583f0 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -823,6 +823,7 @@ config CPU_BPREDICT_DISABLE config CPU_SPECTRE bool + select GENERIC_CPU_VULNERABILITIES config HARDEN_BRANCH_PREDICTOR bool "Harden the branch predictor against aliasing attacks" if EXPERT diff 
--git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index ac63a31347c1..7f9446c1f0cc 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -7,8 +7,36 @@ #include #include #include +#include #include +#ifdef CONFIG_ARM_PSCI +#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + switch ((int)res.a0) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + + default: + return SPECTRE_VULNERABLE; + } +} +#else +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + return SPECTRE_VULNERABLE; +} +#endif + #ifdef CONFIG_HARDEN_BRANCH_PREDICTOR DEFINE_PER_CPU(harden_branch_predictor_fn_t, harden_branch_predictor_fn); @@ -37,13 +65,60 @@ static void __maybe_unused call_hvc_arch_workaround_1(void) arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); } -static void cpu_v7_spectre_init(void) +static unsigned int spectre_v2_install_workaround(unsigned int method) { const char *spectre_v2_method = NULL; int cpu = smp_processor_id(); if (per_cpu(harden_branch_predictor_fn, cpu)) - return; + return SPECTRE_MITIGATED; + + switch (method) { + case SPECTRE_V2_METHOD_BPIALL: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_bpiall; + spectre_v2_method = "BPIALL"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_iciallu; + spectre_v2_method = "ICIALLU"; + break; + + case SPECTRE_V2_METHOD_HVC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_hvc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_hvc_switch_mm; + spectre_v2_method = "hypervisor"; + break; + + case SPECTRE_V2_METHOD_SMC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_smc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_smc_switch_mm; + spectre_v2_method = "firmware"; + break; + } + + if (spectre_v2_method) + pr_info("CPU%u: Spectre v2: using %s workaround\n", + smp_processor_id(), spectre_v2_method); + + return SPECTRE_MITIGATED; +} +#else +static unsigned int spectre_v2_install_workaround(unsigned int method) +{ + pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n"); + + return SPECTRE_VULNERABLE; +} +#endif + +static void cpu_v7_spectre_v2_init(void) +{ + unsigned int state, method = 0; switch (read_cpuid_part()) { case ARM_CPU_PART_CORTEX_A8: @@ -52,32 +127,37 @@ static void cpu_v7_spectre_init(void) case ARM_CPU_PART_CORTEX_A17: case ARM_CPU_PART_CORTEX_A73: case ARM_CPU_PART_CORTEX_A75: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_bpiall; - spectre_v2_method = "BPIALL"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; break; case ARM_CPU_PART_CORTEX_A15: case ARM_CPU_PART_BRAHMA_B15: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_iciallu; - spectre_v2_method = "ICIALLU"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_ICIALLU; break; -#ifdef CONFIG_ARM_PSCI case ARM_CPU_PART_BRAHMA_B53: /* Requires no workaround */ + state = SPECTRE_UNAFFECTED; break; + default: /* Other ARM CPUs require no workaround */ - if (read_cpuid_implementor() == ARM_CPU_IMP_ARM) + if (read_cpuid_implementor() == ARM_CPU_IMP_ARM) { + state = SPECTRE_UNAFFECTED; break; + } /* fallthrough */ - /* Cortex A57/A72 require firmware workaround */ + /* Cortex A57/A72 
require firmware workaround */ case ARM_CPU_PART_CORTEX_A57: case ARM_CPU_PART_CORTEX_A72: { struct arm_smccc_res res; + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + break; + if (psci_ops.smccc_version == SMCCC_VERSION_1_0) break; @@ -88,35 +168,25 @@ static void cpu_v7_spectre_init(void) switch (psci_ops.conduit) { case PSCI_CONDUIT_HVC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_hvc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_hvc_switch_mm; - spectre_v2_method = "hypervisor"; + method = SPECTRE_V2_METHOD_HVC; break; case PSCI_CONDUIT_SMC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_smc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_smc_switch_mm; - spectre_v2_method = "firmware"; + method = SPECTRE_V2_METHOD_SMC; break; default: + state = SPECTRE_VULNERABLE; break; } } -#endif } - if (spectre_v2_method) - pr_info("CPU%u: Spectre v2: using %s workaround\n", - smp_processor_id(), spectre_v2_method); -} -#else -static void cpu_v7_spectre_init(void) -{ + if (state == SPECTRE_MITIGATED) + state = spectre_v2_install_workaround(method); + + spectre_v2_update_state(state, method); } -#endif static __maybe_unused bool cpu_v7_check_auxcr_set(bool *warned, u32 mask, const char *msg) @@ -146,16 +216,16 @@ static bool check_spectre_auxcr(bool *warned, u32 bit) void cpu_v7_ca8_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(6))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_ca15_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(0))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_bugs_init(void) { - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } -- Gitee From abcacb285da806e293dd7cf1653a0c159d438e9f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:12:55 +0000 Subject: [PATCH 269/340] ARM: early traps initialisation stable inclusion from stable-v4.19.234 commit 45c25917ceb7a5377883ef4c3a675276fba8a268 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 04e91b7324760a377a725e218b5ee783826d30f5 upstream. Provide a couple of helpers to copy the vectors and stubs, and also to flush the copied vectors and stubs. 
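A hedged usage sketch (the calls below are taken from the later "ARM: Spectre-BHB workaround" patch in this series): factoring out the copy and flush steps lets an alternative vector table be installed the same way the default one is, e.g.

	copy_from_lma(vectors_page, vec_start, vec_end);
	flush_vectors(vectors_page, 0, vec_end - vec_start);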
Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/kernel/traps.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2a21239aec91..674cf872cccc 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -828,10 +828,22 @@ static inline void __init kuser_init(void *vectors) } #endif +#ifndef CONFIG_CPU_V7M +static void copy_from_lma(void *vma, void *lma_start, void *lma_end) +{ + memcpy(vma, lma_start, lma_end - lma_start); +} + +static void flush_vectors(void *vma, size_t offset, size_t size) +{ + unsigned long start = (unsigned long)vma + offset; + unsigned long end = start + size; + + flush_icache_range(start, end); +} + void __init early_trap_init(void *vectors_base) { -#ifndef CONFIG_CPU_V7M - unsigned long vectors = (unsigned long)vectors_base; extern char __stubs_start[], __stubs_end[]; extern char __vectors_start[], __vectors_end[]; unsigned i; @@ -852,17 +864,20 @@ void __init early_trap_init(void *vectors_base) * into the vector page, mapped at 0xffff0000, and ensure these * are visible to the instruction stream. */ - memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start); - memcpy((void *)vectors + 0x1000, __stubs_start, __stubs_end - __stubs_start); + copy_from_lma(vectors_base, __vectors_start, __vectors_end); + copy_from_lma(vectors_base + 0x1000, __stubs_start, __stubs_end); kuser_init(vectors_base); - flush_icache_range(vectors, vectors + PAGE_SIZE * 2); + flush_vectors(vectors_base, 0, PAGE_SIZE * 2); +} #else /* ifndef CONFIG_CPU_V7M */ +void __init early_trap_init(void *vectors_base) +{ /* * on V7-M there is no need to copy the vector table to a dedicated * memory area. The address is configurable and so a table in the kernel * image can be used. */ -#endif } +#endif -- Gitee From 6700224fee3048474967b952cfdafe3087837394 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:12:56 +0000 Subject: [PATCH 270/340] ARM: use LOADADDR() to get load address of sections stable inclusion from stable-v4.19.234 commit 67e1f18a972be16363c6e88d7b29cde880774164 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 8d9d651ff2270a632e9dc497b142db31e8911315 upstream. Use the linker's LOADADDR() macro to get the load address of the sections, and provide a macro to set the start and end symbols. Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/kernel/vmlinux.lds.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/vmlinux.lds.h b/arch/arm/kernel/vmlinux.lds.h index 8247bc15addc..64c052783514 100644 --- a/arch/arm/kernel/vmlinux.lds.h +++ b/arch/arm/kernel/vmlinux.lds.h @@ -25,6 +25,11 @@ #define ARM_MMU_DISCARD(x) x #endif +/* Set start/end symbol names to the LMA for the section */ +#define ARM_LMA(sym, section) \ + sym##_start = LOADADDR(section); \ + sym##_end = LOADADDR(section) + SIZEOF(section) + #define PROC_INFO \ . 
= ALIGN(4); \ __proc_info_begin = .; \ @@ -100,19 +105,19 @@ * only thing that matters is their relative offsets */ #define ARM_VECTORS \ - __vectors_start = .; \ + __vectors_lma = .; \ .vectors 0xffff0000 : AT(__vectors_start) { \ *(.vectors) \ } \ - . = __vectors_start + SIZEOF(.vectors); \ - __vectors_end = .; \ + ARM_LMA(__vectors, .vectors); \ + . = __vectors_lma + SIZEOF(.vectors); \ \ - __stubs_start = .; \ - .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) { \ + __stubs_lma = .; \ + .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \ *(.stubs) \ } \ - . = __stubs_start + SIZEOF(.stubs); \ - __stubs_end = .; \ + ARM_LMA(__stubs, .stubs); \ + . = __stubs_lma + SIZEOF(.stubs); \ \ PROVIDE(vector_fiq_offset = vector_fiq - ADDR(.vectors)); -- Gitee From 7d063048fddf5f15eb48399a4bfdc8957399919d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:12:57 +0000 Subject: [PATCH 271/340] ARM: Spectre-BHB workaround stable inclusion from stable-v4.19.234 commit 99e14db3b711c27f93079ba9d7f2fff169916d5f category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit b9baf5c8c5c356757f4f9d8180b5e9d234065bc3 upstream. Workaround the Spectre BHB issues for Cortex-A15, Cortex-A57, Cortex-A72, Cortex-A73 and Cortex-A75. We also include Brahma B15 as well to be safe, which is affected by Spectre V2 in the same ways as Cortex-A15. Reviewed-by: Catalin Marinas Signed-off-by: Russell King (Oracle) [changes due to lack of SYSTEM_FREEING_INITMEM - gregkh] Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/include/asm/assembler.h | 10 ++++ arch/arm/include/asm/spectre.h | 4 ++ arch/arm/kernel/entry-armv.S | 79 +++++++++++++++++++++++++++++--- arch/arm/kernel/entry-common.S | 24 ++++++++++ arch/arm/kernel/spectre.c | 4 ++ arch/arm/kernel/traps.c | 38 +++++++++++++++ arch/arm/kernel/vmlinux.lds.h | 18 ++++++-- arch/arm/mm/Kconfig | 10 ++++ arch/arm/mm/proc-v7-bugs.c | 76 ++++++++++++++++++++++++++++++ 9 files changed, 254 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 88286dd483ff..36ce2c1c982f 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -110,6 +110,16 @@ .endm #endif +#if __LINUX_ARM_ARCH__ < 7 + .macro dsb, args + mcr p15, 0, r0, c7, c10, 4 + .endm + + .macro isb, args + mcr p15, 0, r0, c7, r5, 4 + .endm +#endif + .macro asm_trace_hardirqs_off, save=1 #if defined(CONFIG_TRACE_IRQFLAGS) .if \save diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h index 8a9019e08dba..d1fa5607d3aa 100644 --- a/arch/arm/include/asm/spectre.h +++ b/arch/arm/include/asm/spectre.h @@ -14,6 +14,7 @@ enum { __SPECTRE_V2_METHOD_ICIALLU, __SPECTRE_V2_METHOD_SMC, __SPECTRE_V2_METHOD_HVC, + __SPECTRE_V2_METHOD_LOOP8, }; enum { @@ -21,8 +22,11 @@ enum { SPECTRE_V2_METHOD_ICIALLU = BIT(__SPECTRE_V2_METHOD_ICIALLU), SPECTRE_V2_METHOD_SMC = BIT(__SPECTRE_V2_METHOD_SMC), SPECTRE_V2_METHOD_HVC = BIT(__SPECTRE_V2_METHOD_HVC), + SPECTRE_V2_METHOD_LOOP8 = BIT(__SPECTRE_V2_METHOD_LOOP8), }; void spectre_v2_update_state(unsigned int state, unsigned int methods); +int spectre_bhb_update_vectors(unsigned int method); + #endif diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index e85a3af9ddeb..ffcff378e3c8 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1038,12 +1038,11 
@@ vector_\name: sub lr, lr, #\correction .endif - @ - @ Save r0, lr_ (parent PC) and spsr_ - @ (parent CPSR) - @ + @ Save r0, lr_ (parent PC) stmia sp, {r0, lr} @ save r0, lr - mrs lr, spsr + + @ Save spsr_ (parent CPSR) +2: mrs lr, spsr str lr, [sp, #8] @ save spsr @ @@ -1064,6 +1063,44 @@ vector_\name: movs pc, lr @ branch to handler in SVC mode ENDPROC(vector_\name) +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .subsection 1 + .align 5 +vector_bhb_loop8_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mov r0, #8 +1: b . + 4 + subs r0, r0, #1 + bne 1b + dsb + isb + b 2b +ENDPROC(vector_bhb_loop8_\name) + +vector_bhb_bpiall_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mcr p15, 0, r0, c7, c5, 6 @ BPIALL + @ isb not needed due to "movs pc, lr" in the vector stub + @ which gives a "context synchronisation". + b 2b +ENDPROC(vector_bhb_bpiall_\name) + .previous +#endif + .align 2 @ handler addresses follow this label 1: @@ -1072,6 +1109,10 @@ ENDPROC(vector_\name) .section .stubs, "ax", %progbits @ This must be the first word .word vector_swi +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .word vector_bhb_loop8_swi + .word vector_bhb_bpiall_swi +#endif vector_rst: ARM( swi SYS_ERROR0 ) @@ -1186,8 +1227,10 @@ vector_addrexcptn: * FIQ "NMI" handler *----------------------------------------------------------------------------- * Handle a FIQ using the SVC stack allowing FIQ act like NMI on x86 - * systems. + * systems. This must be the last vector stub, so lets place it in its own + * subsection. */ + .subsection 2 vector_stub fiq, FIQ_MODE, 4 .long __fiq_usr @ 0 (USR_26 / USR_32) @@ -1220,6 +1263,30 @@ vector_addrexcptn: W(b) vector_irq W(b) vector_fiq +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .section .vectors.bhb.loop8, "ax", %progbits +.L__vectors_bhb_loop8_start: + W(b) vector_rst + W(b) vector_bhb_loop8_und + W(ldr) pc, .L__vectors_bhb_loop8_start + 0x1004 + W(b) vector_bhb_loop8_pabt + W(b) vector_bhb_loop8_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_loop8_irq + W(b) vector_bhb_loop8_fiq + + .section .vectors.bhb.bpiall, "ax", %progbits +.L__vectors_bhb_bpiall_start: + W(b) vector_rst + W(b) vector_bhb_bpiall_und + W(ldr) pc, .L__vectors_bhb_bpiall_start + 0x1008 + W(b) vector_bhb_bpiall_pabt + W(b) vector_bhb_bpiall_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_bpiall_irq + W(b) vector_bhb_bpiall_fiq +#endif + .data .align 2 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 0465d65d23de..e27fc2df5231 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -165,6 +165,29 @@ ENDPROC(ret_from_fork) *----------------------------------------------------------------------------- */ + .align 5 +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +ENTRY(vector_bhb_loop8_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mov r8, #8 +1: b 2f +2: subs r8, r8, #1 + bne 1b + dsb + isb + b 3f +ENDPROC(vector_bhb_loop8_swi) + + .align 5 +ENTRY(vector_bhb_bpiall_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mcr p15, 0, r8, c7, c5, 6 @ BPIALL + isb + b 3f +ENDPROC(vector_bhb_bpiall_swi) +#endif .align 5 ENTRY(vector_swi) #ifdef CONFIG_CPU_V7M @@ -172,6 +195,7 @@ ENTRY(vector_swi) #else sub sp, sp, #PT_REGS_SIZE stmia sp, {r0 - r12} @ Calling r0 - r12 +3: ARM( add r8, sp, #S_PC ) ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr THUMB( mov r8, sp ) diff --git 
a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index 6f6dd1cfd099..ade967f18d06 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -45,6 +45,10 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, method = "Firmware call"; break; + case SPECTRE_V2_METHOD_LOOP8: + method = "History overwrite"; + break; + default: method = "Multiple mitigations"; break; diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 674cf872cccc..4c61ac4f3d35 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -842,6 +843,43 @@ static void flush_vectors(void *vma, size_t offset, size_t size) flush_icache_range(start, end); } +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +int spectre_bhb_update_vectors(unsigned int method) +{ + extern char __vectors_bhb_bpiall_start[], __vectors_bhb_bpiall_end[]; + extern char __vectors_bhb_loop8_start[], __vectors_bhb_loop8_end[]; + void *vec_start, *vec_end; + + if (system_state > SYSTEM_SCHEDULING) { + pr_err("CPU%u: Spectre BHB workaround too late - system vulnerable\n", + smp_processor_id()); + return SPECTRE_VULNERABLE; + } + + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + vec_start = __vectors_bhb_loop8_start; + vec_end = __vectors_bhb_loop8_end; + break; + + case SPECTRE_V2_METHOD_BPIALL: + vec_start = __vectors_bhb_bpiall_start; + vec_end = __vectors_bhb_bpiall_end; + break; + + default: + pr_err("CPU%u: unknown Spectre BHB state %d\n", + smp_processor_id(), method); + return SPECTRE_VULNERABLE; + } + + copy_from_lma(vectors_page, vec_start, vec_end); + flush_vectors(vectors_page, 0, vec_end - vec_start); + + return SPECTRE_MITIGATED; +} +#endif + void __init early_trap_init(void *vectors_base) { extern char __stubs_start[], __stubs_end[]; diff --git a/arch/arm/kernel/vmlinux.lds.h b/arch/arm/kernel/vmlinux.lds.h index 64c052783514..99fb6fec623a 100644 --- a/arch/arm/kernel/vmlinux.lds.h +++ b/arch/arm/kernel/vmlinux.lds.h @@ -106,11 +106,23 @@ */ #define ARM_VECTORS \ __vectors_lma = .; \ - .vectors 0xffff0000 : AT(__vectors_start) { \ - *(.vectors) \ + OVERLAY 0xffff0000 : NOCROSSREFS AT(__vectors_lma) { \ + .vectors { \ + *(.vectors) \ + } \ + .vectors.bhb.loop8 { \ + *(.vectors.bhb.loop8) \ + } \ + .vectors.bhb.bpiall { \ + *(.vectors.bhb.bpiall) \ + } \ } \ ARM_LMA(__vectors, .vectors); \ - . = __vectors_lma + SIZEOF(.vectors); \ + ARM_LMA(__vectors_bhb_loop8, .vectors.bhb.loop8); \ + ARM_LMA(__vectors_bhb_bpiall, .vectors.bhb.bpiall); \ + . = __vectors_lma + SIZEOF(.vectors) + \ + SIZEOF(.vectors.bhb.loop8) + \ + SIZEOF(.vectors.bhb.bpiall); \ \ __stubs_lma = .; \ .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \ diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 2069a1b583f0..096afa16b531 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -844,6 +844,16 @@ config HARDEN_BRANCH_PREDICTOR If unsure, say Y. +config HARDEN_BRANCH_HISTORY + bool "Harden Spectre style attacks against branch history" if EXPERT + depends on CPU_SPECTRE + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. When + taking an exception, a sequence of branches overwrites the branch + history, or branch history is invalidated. 
+ config TLS_REG_EMUL bool select NEED_KUSER_HELPERS diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index 7f9446c1f0cc..e7557b88a995 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -188,6 +188,81 @@ static void cpu_v7_spectre_v2_init(void) spectre_v2_update_state(state, method); } +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +static int spectre_bhb_method; + +static const char *spectre_bhb_method_name(int method) +{ + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + return "loop"; + + case SPECTRE_V2_METHOD_BPIALL: + return "BPIALL"; + + default: + return "unknown"; + } +} + +static int spectre_bhb_install_workaround(int method) +{ + if (spectre_bhb_method != method) { + if (spectre_bhb_method) { + pr_err("CPU%u: Spectre BHB: method disagreement, system vulnerable\n", + smp_processor_id()); + + return SPECTRE_VULNERABLE; + } + + if (spectre_bhb_update_vectors(method) == SPECTRE_VULNERABLE) + return SPECTRE_VULNERABLE; + + spectre_bhb_method = method; + } + + pr_info("CPU%u: Spectre BHB: using %s workaround\n", + smp_processor_id(), spectre_bhb_method_name(method)); + + return SPECTRE_MITIGATED; +} +#else +static int spectre_bhb_install_workaround(int method) +{ + return SPECTRE_VULNERABLE; +} +#endif + +static void cpu_v7_spectre_bhb_init(void) +{ + unsigned int state, method = 0; + + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A15: + case ARM_CPU_PART_BRAHMA_B15: + case ARM_CPU_PART_CORTEX_A57: + case ARM_CPU_PART_CORTEX_A72: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_LOOP8; + break; + + case ARM_CPU_PART_CORTEX_A73: + case ARM_CPU_PART_CORTEX_A75: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; + break; + + default: + state = SPECTRE_UNAFFECTED; + break; + } + + if (state == SPECTRE_MITIGATED) + state = spectre_bhb_install_workaround(method); + + spectre_v2_update_state(state, method); +} + static __maybe_unused bool cpu_v7_check_auxcr_set(bool *warned, u32 mask, const char *msg) { @@ -228,4 +303,5 @@ void cpu_v7_ca15_ibe(void) void cpu_v7_bugs_init(void) { cpu_v7_spectre_v2_init(); + cpu_v7_spectre_bhb_init(); } -- Gitee From a16c8564c6f75602c0b4a2d55179d6867c972c48 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:12:58 +0000 Subject: [PATCH 272/340] ARM: include unprivileged BPF status in Spectre V2 reporting stable inclusion from stable-v4.19.234 commit 29db7e4b67fccf5e1fe28ec89f2add90ce74d77b category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 25875aa71dfefd1959f07e626c4d285b88b27ac2 upstream. The mitigations for Spectre-BHB are only applied when an exception is taken, but when unprivileged BPF is enabled, userspace can load BPF programs that can be used to exploit the problem. When unprivileged BPF is enabled, report the vulnerable status via the spectre_v2 sysfs file. 
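A hedged usage note (not part of the patch): with the sysctl kernel.unprivileged_bpf_disabled set to 0, the report added by this patch is expected to look like:

	# cat /sys/devices/system/cpu/vulnerabilities/spectre_v2
	Vulnerable: Unprivileged eBPF enabled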
Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/kernel/spectre.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index ade967f18d06..e7fea962d632 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -1,9 +1,19 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false +#endif +} + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { @@ -31,6 +41,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, if (spectre_v2_state != SPECTRE_MITIGATED) return sprintf(buf, "%s\n", "Vulnerable"); + if (_unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + switch (spectre_v2_methods) { case SPECTRE_V2_METHOD_BPIALL: method = "Branch predictor hardening"; -- Gitee From 8534f17effd5031faad823a2967249a7cca36aa4 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Thu, 28 Apr 2022 08:12:59 +0000 Subject: [PATCH 273/340] ARM: fix build error when BPF_SYSCALL is disabled stable inclusion from stable-v4.19.234 commit fffcf80d055c4f619ae9cd8bf3a8b67319317100 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 330f4c53d3c2d8b11d86ec03a964b86dc81452f5 upstream. It was missing a semicolon. Signed-off-by: Emmanuel Gil Peyrot Reviewed-by: Nathan Chancellor Fixes: 25875aa71dfe ("ARM: include unprivileged BPF status in Spectre V2 reporting"). Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/kernel/spectre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index e7fea962d632..0dcefc36fb7a 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -10,7 +10,7 @@ static bool _unprivileged_ebpf_enabled(void) #ifdef CONFIG_BPF_SYSCALL return !sysctl_unprivileged_bpf_disabled; #else - return false + return false; #endif } -- Gitee From eda81c50f279aeb5a35dfaa0e2daec8bae1abf6f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:13:00 +0000 Subject: [PATCH 274/340] ARM: fix co-processor register typo stable inclusion from stable-v4.19.234 commit 8dd98efe9e3f20927a2970bed5974ca8c3646c92 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 33970b031dc4653cc9dc80f2886976706c4c8ef1 upstream. 
In the recent Spectre BHB patches, there was a typo that is only exposed in certain configurations: mcr p15,0,XX,c7,r5,4 should have been mcr p15,0,XX,c7,c5,4 Reported-by: kernel test robot Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Signed-off-by: Russell King (Oracle) Acked-by: Catalin Marinas Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/include/asm/assembler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 36ce2c1c982f..f903e1040b36 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -116,7 +116,7 @@ .endm .macro isb, args - mcr p15, 0, r0, c7, r5, 4 + mcr p15, 0, r0, c7, c5, 4 .endm #endif -- Gitee From d4d9408b275f034e2a7d392f81ea83ec755079c1 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 28 Apr 2022 08:13:01 +0000 Subject: [PATCH 275/340] ARM: Do not use NOCROSSREFS directive with ld.lld stable inclusion from stable-v4.19.234 commit 868d5f8a428f41da7f493a6744913b55c5c97ee2 category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 36168e387fa7d0f1fe0cd5cf76c8cea7aee714fa upstream. ld.lld does not support the NOCROSSREFS directive at the moment, which breaks the build after commit b9baf5c8c5c3 ("ARM: Spectre-BHB workaround"): ld.lld: error: ./arch/arm/kernel/vmlinux.lds:34: AT expected, but got NOCROSSREFS Support for this directive will eventually be implemented, at which point a version check can be added. To avoid breaking the build in the meantime, just define NOCROSSREFS to nothing when using ld.lld, with a link to the issue for tracking. Cc: stable@vger.kernel.org Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Link: https://github.com/ClangBuiltLinux/linux/issues/1609 Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/kernel/vmlinux.lds.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/kernel/vmlinux.lds.h b/arch/arm/kernel/vmlinux.lds.h index 99fb6fec623a..78d156e4f008 100644 --- a/arch/arm/kernel/vmlinux.lds.h +++ b/arch/arm/kernel/vmlinux.lds.h @@ -25,6 +25,14 @@ #define ARM_MMU_DISCARD(x) x #endif +/* + * ld.lld does not support NOCROSSREFS: + * https://github.com/ClangBuiltLinux/linux/issues/1609 + */ +#ifdef CONFIG_LD_IS_LLD +#define NOCROSSREFS +#endif + /* Set start/end symbol names to the LMA for the section */ #define ARM_LMA(sym, section) \ sym##_start = LOADADDR(section); \ -- Gitee From d61ef7e2bb00d95d9380a819b14a5e756b6bed42 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:13:02 +0000 Subject: [PATCH 276/340] ARM: fix build warning in proc-v7-bugs.c stable inclusion from stable-v4.19.234 commit 623ca876d4a91665b7d948e399f5776c713722ea category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit b1a384d2cbccb1eb3f84765020d25e2c1929706e upstream. The kernel test robot discovered that building without HARDEN_BRANCH_PREDICTOR issues a warning due to a missing argument to pr_info(). Add the missing argument. 
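A hedged illustration of why the missing argument matters (not from the original commit text): the %u conversion makes vsnprintf() fetch a variadic argument that was never passed, so the printed CPU number is whatever happens to be in that register or stack slot. The fixed call simply supplies it:

	pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n",
		smp_processor_id());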
Reported-by: kernel test robot Fixes: 9dd78194a372 ("ARM: report Spectre v2 status through sysfs") Signed-off-by: Russell King (Oracle) Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/mm/proc-v7-bugs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index e7557b88a995..87a93b76e148 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -110,7 +110,8 @@ static unsigned int spectre_v2_install_workaround(unsigned int method) #else static unsigned int spectre_v2_install_workaround(unsigned int method) { - pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n"); + pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n", + smp_processor_id()); return SPECTRE_VULNERABLE; } -- Gitee From f22ff10051761324031719ee2374881e896df1f6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 28 Apr 2022 08:13:03 +0000 Subject: [PATCH 277/340] ARM: Spectre-BHB: provide empty stub for non-config stable inclusion from stable-v4.19.235 commit d60f27d6ff8a8d28441b44cd2e8a2fda64c99aac category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 68453767131a5deec1e8f9ac92a9042f929e585d upstream. When CONFIG_GENERIC_CPU_VULNERABILITIES is not set, references to spectre_v2_update_state() cause a build error, so provide an empty stub for that function when the Kconfig option is not set. Fixes this build error: arm-linux-gnueabi-ld: arch/arm/mm/proc-v7-bugs.o: in function `cpu_v7_bugs_init': proc-v7-bugs.c:(.text+0x52): undefined reference to `spectre_v2_update_state' arm-linux-gnueabi-ld: proc-v7-bugs.c:(.text+0x82): undefined reference to `spectre_v2_update_state' Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Russell King Cc: Catalin Marinas Cc: linux-arm-kernel@lists.infradead.org Cc: patches@armlinux.org.uk Acked-by: Russell King (Oracle) Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/include/asm/spectre.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h index d1fa5607d3aa..85f9e538fb32 100644 --- a/arch/arm/include/asm/spectre.h +++ b/arch/arm/include/asm/spectre.h @@ -25,7 +25,13 @@ enum { SPECTRE_V2_METHOD_LOOP8 = BIT(__SPECTRE_V2_METHOD_LOOP8), }; +#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES void spectre_v2_update_state(unsigned int state, unsigned int methods); +#else +static inline void spectre_v2_update_state(unsigned int state, + unsigned int methods) +{} +#endif int spectre_bhb_update_vectors(unsigned int method); -- Gitee From f442b1f0811c0da01e8cf709c65c556575206b1d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Apr 2022 08:13:04 +0000 Subject: [PATCH 278/340] ARM: fix Thumb2 regression with Spectre BHB stable inclusion from stable-v4.19.235 commit 691f354689e94571853c79ac3d7d6e7f27dfb87b category: bugfix bugzilla: 186460, https://gitee.com/src-openeuler/kernel/issues/I53MHA CVE: CVE-2022-23960 -------------------------------- commit 6c7cb60bff7aec24b834343ff433125f469886a3 upstream. When building for Thumb2, the vectors make use of a local label. 
Sadly, the Spectre BHB code also uses a local label with the same number which results in the Thumb2 reference pointing at the wrong place. Fix this by changing the number used for the Spectre BHB local label. Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Tested-by: Nathan Chancellor Signed-off-by: Russell King (Oracle) Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Reviewed-by: Xiu Jianfeng Reviewed-by: Liao Chang Signed-off-by: Yongqiang Liu --- arch/arm/kernel/entry-armv.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index ffcff378e3c8..c2d36d31e908 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1076,9 +1076,9 @@ vector_bhb_loop8_\name: @ bhb workaround mov r0, #8 -1: b . + 4 +3: b . + 4 subs r0, r0, #1 - bne 1b + bne 3b dsb isb b 2b -- Gitee From f63bceddbb8f863c821c4ba2770636125b27cd45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Apr 2022 03:33:43 +0000 Subject: [PATCH 279/340] llc: fix netdevice reference leaks in llc_ui_bind() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v4.19.237 commit d14193111c436fc5de33206c67c7afd45c730099 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I51YBN CVE: CVE-2022-28356 ------------------------------------------------- commit 764f4eb6846f5475f1244767d24d25dd86528a4a upstream. Whenever llc_ui_bind() and/or llc_ui_autobind() took a reference on a netdevice but subsequently fail, they must properly release their reference or risk the infamous message from unregister_netdevice() at device dismantle. unregister_netdevice: waiting for eth0 to become free. Usage count = 3 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: 赵子轩 Reported-by: Stoyan Manolov Link: https://lore.kernel.org/r/20220323004147.1990845-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/llc/af_llc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index fa0f3c1543ba..239dce060872 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -303,6 +303,10 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out: + if (rc) { + dev_put(llc->dev); + llc->dev = NULL; + } return rc; } @@ -401,6 +405,10 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) out_put: llc_sap_put(sap); out: + if (rc) { + dev_put(llc->dev); + llc->dev = NULL; + } release_sock(sk); return rc; } -- Gitee From ca0c52d1c8c6317e33a2c574f7ca177902c7d049 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Fri, 29 Apr 2022 03:33:44 +0000 Subject: [PATCH 280/340] netdevice: add the case if dev is NULL stable inclusion from stable-v4.19.238 commit 574a12b4445076a04499962ce8b8ae15f75a7955 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I51YBN CVE: CVE-2022-28356 ------------------------------------------------- commit b37a466837393af72fe8bcb8f1436410f3f173f3 upstream. Add the case if dev is NULL in dev_{put, hold}, so the caller doesn't need to care whether dev is NULL or not. Signed-off-by: Yajun Deng Signed-off-by: David S. 
Miller Cc: Pavel Machek Signed-off-by: Greg Kroah-Hartman Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- include/linux/netdevice.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9a7d7e6304f4..752df39590ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3790,7 +3790,8 @@ void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { - this_cpu_dec(*dev->pcpu_refcnt); + if (dev) + this_cpu_dec(*dev->pcpu_refcnt); } /** @@ -3801,7 +3802,8 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { - this_cpu_inc(*dev->pcpu_refcnt); + if (dev) + this_cpu_inc(*dev->pcpu_refcnt); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on -- Gitee From b694ae58e996c1e29d23a0e0e1c1e042954911cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Apr 2022 03:33:45 +0000 Subject: [PATCH 281/340] llc: only change llc->dev when bind() succeeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v4.19.237 commit c106f9aa6cd8913af9188ad361899ae696b5de37 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I51YBN CVE: CVE-2022-28356 ------------------------------------------------- commit 2d327a79ee176930dc72c131a970c891d367c1dc upstream. My latest patch, attempting to fix the refcount leak in a minimal way turned out to add a new bug. Whenever the bind operation fails before we attempt to grab a reference count on a device, we might release the device refcount of a prior successful bind() operation. syzbot was not happy about this [1]. Note to stable teams: Make sure commit b37a46683739 ("netdevice: add the case if dev is NULL") is already present in your trees. 
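Before the syzbot splat quoted below at [1], here is a condensed sketch of the corrected flow that the llc hunks further down implement: work on a local net_device pointer, take the reference on that local pointer, and only commit it to llc->dev once nothing can fail any more. Names are simplified and error handling is elided; this is not the exact kernel code. The unconditional dev_put() on the exit path relies on dev_put(NULL) being a no-op, which is exactly why the netdevice patch above is a prerequisite.

	struct net_device *dev = NULL;

	dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);	/* takes a reference */
	if (!dev)
		goto out;

	/* ... every check that may still fail goes here ... */

	/* Note: We do not expect errors from this point. */
	llc->dev = dev;		/* ownership moves to the socket */
	dev = NULL;
	rc = 0;
out:
	dev_put(dev);		/* no-op when dev is still NULL or already handed over */
	return rc;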
[1] general protection fault, probably for non-canonical address 0xdffffc0000000070: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000380-0x0000000000000387] CPU: 1 PID: 3590 Comm: syz-executor361 Tainted: G W 5.17.0-syzkaller-04796-g169e77764adc #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:llc_ui_connect+0x400/0xcb0 net/llc/af_llc.c:500 Code: 80 3c 02 00 0f 85 fc 07 00 00 4c 8b a5 38 05 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d bc 24 80 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 a9 07 00 00 49 8b b4 24 80 03 00 00 4c 89 f2 48 RSP: 0018:ffffc900038cfcc0 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff8880756eb600 RCX: 0000000000000000 RDX: 0000000000000070 RSI: ffffc900038cfe3e RDI: 0000000000000380 RBP: ffff888015ee5000 R08: 0000000000000001 R09: ffff888015ee5535 R10: ffffed1002bdcaa6 R11: 0000000000000000 R12: 0000000000000000 R13: ffffc900038cfe37 R14: ffffc900038cfe38 R15: ffff888015ee5012 FS: 0000555555acd300(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000280 CR3: 0000000077db6000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __sys_connect_file+0x155/0x1a0 net/socket.c:1900 __sys_connect+0x161/0x190 net/socket.c:1917 __do_sys_connect net/socket.c:1927 [inline] __se_sys_connect net/socket.c:1924 [inline] __x64_sys_connect+0x6f/0xb0 net/socket.c:1924 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f016acb90b9 Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd417947f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f016acb90b9 RDX: 0000000000000010 RSI: 0000000020000140 RDI: 0000000000000003 RBP: 00007f016ac7d0a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f016ac7d130 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:llc_ui_connect+0x400/0xcb0 net/llc/af_llc.c:500 Fixes: 764f4eb6846f ("llc: fix netdevice reference leaks in llc_ui_bind()") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: 赵子轩 Cc: Stoyan Manolov Link: https://lore.kernel.org/r/20220325035827.360418-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/llc/af_llc.c | 57 +++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 239dce060872..d23cd51efc5f 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -268,6 +268,7 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); + struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; @@ -279,14 +280,14 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { - llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); - if 
(llc->dev && addr->sllc_arphrd != llc->dev->type) { - dev_put(llc->dev); - llc->dev = NULL; + dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + if (dev && addr->sllc_arphrd != dev->type) { + dev_put(dev); + dev = NULL; } } else - llc->dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); - if (!llc->dev) + dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); + if (!dev) goto out; rc = -EUSERS; llc->laddr.lsap = llc_ui_autoport(); @@ -296,6 +297,11 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) sap = llc_sap_open(llc->laddr.lsap, NULL); if (!sap) goto out; + + /* Note: We do not expect errors from this point. */ + llc->dev = dev; + dev = NULL; + memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ @@ -303,10 +309,7 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out: - if (rc) { - dev_put(llc->dev); - llc->dev = NULL; - } + dev_put(dev); return rc; } @@ -329,6 +332,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); + struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; @@ -344,25 +348,26 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { - llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); - if (llc->dev) { + dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); + if (dev) { if (is_zero_ether_addr(addr->sllc_mac)) - memcpy(addr->sllc_mac, llc->dev->dev_addr, + memcpy(addr->sllc_mac, dev->dev_addr, IFHWADDRLEN); - if (addr->sllc_arphrd != llc->dev->type || + if (addr->sllc_arphrd != dev->type || !ether_addr_equal(addr->sllc_mac, - llc->dev->dev_addr)) { + dev->dev_addr)) { rc = -EINVAL; - llc->dev = NULL; + dev = NULL; } } - } else - llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, + } else { + dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); - if (llc->dev) - dev_hold(llc->dev); + } + if (dev) + dev_hold(dev); rcu_read_unlock(); - if (!llc->dev) + if (!dev) goto out; if (!addr->sllc_sap) { rc = -EUSERS; @@ -395,6 +400,11 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) goto out_put; } } + + /* Note: We do not expect errors from this point. */ + llc->dev = dev; + dev = NULL; + llc->laddr.lsap = addr->sllc_sap; memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); @@ -405,10 +415,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) out_put: llc_sap_put(sap); out: - if (rc) { - dev_put(llc->dev); - llc->dev = NULL; - } + dev_put(dev); release_sock(sk); return rc; } -- Gitee From 7883a93a82b98383e15046dca2aa3ae35bc653ec Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 29 Apr 2022 03:33:46 +0000 Subject: [PATCH 282/340] can: mcba_usb: mcba_usb_start_xmit(): fix double dev_kfree_skb in error path stable inclusion from stable-v4.19.238 commit a8bba9fd73775e66b4021b18f2193f769ce48a59 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I51YBN CVE: CVE-2022-28389 ------------------------------------------------- commit 04c9b00ba83594a29813d6b1fb8fdc93a3915174 upstream. 
There is no need to call dev_kfree_skb() when usb_submit_urb() fails because can_put_echo_skb() deletes original skb and can_free_echo_skb() deletes the cloned skb. Fixes: 51f3baad7de9 ("can: mcba_usb: Add support for Microchip CAN BUS Analyzer") Link: https://lore.kernel.org/all/20220311080208.45047-1-hbh25y@gmail.com Signed-off-by: Hangyu Hua Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/net/can/usb/mcba_usb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 1b0afeaf1a3c..25fcec5125ed 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -377,7 +377,6 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb, xmit_failed: can_free_echo_skb(priv->netdev, ctx->ndx); mcba_usb_free_ctx(ctx); - dev_kfree_skb(skb); stats->tx_dropped++; return NETDEV_TX_OK; -- Gitee From 7b9501c21752e4993e39280273db1f8ffdbc4498 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Fri, 29 Apr 2022 03:33:47 +0000 Subject: [PATCH 283/340] hamradio: defer ax25 kfree after unregister_netdev stable inclusion from stable-v4.19.223 commit 896193a02a2981e60c40d4614fd095ce92135ccd category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I55483 CVE: CVE-2022-1195 ------------------------------------------------- commit 3e0588c291d6ce225f2b891753ca41d45ba42469 upstream. There is a possible race condition (use-after-free) like below (USE) | (FREE) ax25_sendmsg | ax25_queue_xmit | dev_queue_xmit | __dev_queue_xmit | __dev_xmit_skb | sch_direct_xmit | ... xmit_one | netdev_start_xmit | tty_ldisc_kill __netdev_start_xmit | mkiss_close ax_xmit | kfree ax_encaps | | Even though there are two synchronization primitives before the kfree: 1. wait_for_completion(&ax->dead). This can prevent the race with routines from mkiss_ioctl. However, it cannot stop the routine coming from upper layer, i.e., the ax25_sendmsg. 2. netif_stop_queue(ax->dev). It seems that this line of code aims to halt the transmit queue but it fails to stop the routine that already being xmit. This patch reorder the kfree after the unregister_netdev to avoid the possible UAF as the unregister_netdev() is well synchronized and won't return if there is a running routine. Signed-off-by: Lin Ma Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/net/hamradio/mkiss.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 3b14e6e281d4..cc977d0cb2d9 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -803,13 +803,13 @@ static void mkiss_close(struct tty_struct *tty) */ netif_stop_queue(ax->dev); - /* Free all AX25 frame buffers. */ - kfree(ax->rbuff); - kfree(ax->xbuff); - ax->tty = NULL; unregister_netdev(ax->dev); + + /* Free all AX25 frame buffers. */ + kfree(ax->rbuff); + kfree(ax->xbuff); } /* Perform I/O control on an active ax25 channel. 
*/ -- Gitee From c46a8e3369dfc7231551ce48cf0039d4836ad9a8 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Fri, 29 Apr 2022 03:33:48 +0000 Subject: [PATCH 284/340] hamradio: improve the incomplete fix to avoid NPD stable inclusion from stable-v4.19.223 commit b68f41c6320b2b7fbb54a95f07a69f3dc7e56c59 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I55483 CVE: CVE-2022-1195 ------------------------------------------------- commit b2f37aead1b82a770c48b5d583f35ec22aabb61e upstream. The previous commit 3e0588c291d6 ("hamradio: defer ax25 kfree after unregister_netdev") reorder the kfree operations and unregister_netdev operation to prevent UAF. This commit improves the previous one by also deferring the nullify of the ax->tty pointer. Otherwise, a NULL pointer dereference bug occurs. Partial of the stack trace is shown below. BUG: kernel NULL pointer dereference, address: 0000000000000538 RIP: 0010:ax_xmit+0x1f9/0x400 ... Call Trace: dev_hard_start_xmit+0xec/0x320 sch_direct_xmit+0xea/0x240 __qdisc_run+0x166/0x5c0 __dev_queue_xmit+0x2c7/0xaf0 ax25_std_establish_data_link+0x59/0x60 ax25_connect+0x3a0/0x500 ? security_socket_connect+0x2b/0x40 __sys_connect+0x96/0xc0 ? __hrtimer_init+0xc0/0xc0 ? common_nsleep+0x2e/0x50 ? switch_fpu_return+0x139/0x1a0 __x64_sys_connect+0x11/0x20 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The crash point is shown as below static void ax_encaps(...) { ... set_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags); // ax->tty = NULL! ... } By placing the nullify action after the unregister_netdev, the ax->tty pointer won't be assigned as NULL net_device framework layer is well synchronized. Signed-off-by: Lin Ma Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/net/hamradio/mkiss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index cc977d0cb2d9..a6311310dcaa 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -803,13 +803,13 @@ static void mkiss_close(struct tty_struct *tty) */ netif_stop_queue(ax->dev); - ax->tty = NULL; - unregister_netdev(ax->dev); /* Free all AX25 frame buffers. */ kfree(ax->rbuff); kfree(ax->xbuff); + + ax->tty = NULL; } /* Perform I/O control on an active ax25 channel. */ -- Gitee From 9f36ea852c9a5aeedc38a0d5ebcb1851965bf1b4 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 6 May 2022 12:14:39 +0000 Subject: [PATCH 285/340] mm: Show correct reliable pagecache size hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Show corrent reliable pagecache size if /proc/meminfo if memory reliable and pagecache use reliable memory is enabled. 
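As a side note on the unit handling in the hunk below: the per-cpu counter holds a number of pages, while /proc/meminfo reports kB, so the value has to be shifted by (PAGE_SHIFT - 10). A minimal sketch of the conversion, assuming the usual 4 KiB pages (PAGE_SHIFT == 12); the helper name is made up for illustration:

	/* pages_to_kb(1000) == 4000 on a 4 KiB-page kernel: 2^12 / 2^10 = 4 */
	static inline u64 pages_to_kb(u64 pages)
	{
		return pages << (PAGE_SHIFT - 10);
	}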
Fixes: 9f07eb30e624 ("mm: Add space after ReliableFileCache") Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/mem_reliable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 646c27993df4..bca569412685 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -231,7 +231,7 @@ void reliable_report_meminfo(struct seq_file *m) num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); show_val_kb(m, "FileCache: ", num); seq_printf(m, "ReliableFileCache: %8llu kB\n", - nr_pagecache_pages); + nr_pagecache_pages << (PAGE_SHIFT - 10)); } } -- Gitee From 88b64c32007e927a9bdb3be89e6a8d2b73d98e28 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 6 May 2022 12:14:40 +0000 Subject: [PATCH 286/340] mm: Fix reliable task used problem shown in meminfo hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Previous reliable task used does not contains reliable pagecache pages if pagecache use reliable memory feature is disabled. This will lead to incorrect result because reliable task's pagecache is allocated from mirrored region. With this patch, reliable task used will always contains reliable anon pages and reliable pagecache pages. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/mem_reliable.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index bca569412685..82a0486ea697 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -196,7 +196,6 @@ static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) void reliable_report_meminfo(struct seq_file *m) { - bool pagecache_enabled = pagecache_reliable_is_enabled(); s64 nr_pagecache_pages = 0; s64 nr_anon_pages = 0; long nr_buddy_pages = 0; @@ -209,8 +208,7 @@ void reliable_report_meminfo(struct seq_file *m) nr_buddy_pages += per_cpu(nr_reliable_buddy_pages, cpu); nr_anon_pages = percpu_counter_sum_positive(&anon_reliable_pages); - if (pagecache_enabled) - nr_pagecache_pages = percpu_counter_sum_positive(&pagecache_reliable_pages); + nr_pagecache_pages = percpu_counter_sum_positive(&pagecache_reliable_pages); show_val_kb(m, "ReliableTotal: ", total_reliable_mem_sz() >> PAGE_SHIFT); @@ -224,7 +222,7 @@ void reliable_report_meminfo(struct seq_file *m) percpu_counter_sum(&reliable_shmem_used_nr_page)); } - if (pagecache_enabled) { + if (pagecache_reliable_is_enabled()) { unsigned long num = 0; num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); -- Gitee From 01f7c388acbd26782b61be8c2fe8a5ede6820281 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 6 May 2022 12:14:41 +0000 Subject: [PATCH 287/340] mm: Add more debug info if oom occurs hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Add more debug info about memory reliable if oom occurs. 
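To give an idea of what the extra debug output means in practice, an OOM report on a kernel with memory reliable enabled would additionally carry lines of roughly the following shape once the hunk below is applied; the numbers here are purely illustrative, and the shmem and file-cache lines only appear when those sub-features are enabled:

	ReliableTotal: 16777216 kB
	ReliableUsed: 2097152 kB
	ReliableTaskLimit: 8388608 kB
	ReliableTaskUsed: 1048576 kB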
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/mem_reliable.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 82a0486ea697..f8b8543d7933 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -490,18 +490,40 @@ static void mem_reliable_feature_disable(int idx) void reliable_show_mem_info(void) { - s64 num = 0; + s64 nr_pagecache_pages = 0; + s64 nr_anon_pages = 0; if (!mem_reliable_is_enabled()) return; - num += percpu_counter_sum_positive(&anon_reliable_pages); - num += percpu_counter_sum_positive(&pagecache_reliable_pages); + nr_anon_pages = percpu_counter_sum_positive(&anon_reliable_pages); + nr_pagecache_pages = percpu_counter_sum_positive(&pagecache_reliable_pages); + + pr_info("ReliableTotal: %lu kB\n", total_reliable_mem_sz() >> 10); + pr_info("ReliableUsed: %lu kB\n", used_reliable_mem_sz() >> 10); + pr_info("ReliableTaskLimit: %lu kB\n", task_reliable_limit >> 10); + pr_info("ReliableTaskUsed: %lld kB\n", + (nr_anon_pages + nr_pagecache_pages) << (PAGE_SHIFT - 10)); + + if (shmem_reliable_is_enabled()) { + pr_info("ReliableShmemPagesLimit: %ld\n", + shmem_reliable_nr_page); + pr_info("ReliableShmem: %llu kB\n", + percpu_counter_sum(&reliable_shmem_used_nr_page) + << (PAGE_SHIFT - 10)); + } + + if (pagecache_reliable_is_enabled()) { + unsigned long num = 0; - pr_info("ReliableTotal: %lu kB", total_reliable_mem_sz() >> 10); - pr_info("ReliableUsed: %lu kB", used_reliable_mem_sz() >> 10); - pr_info("task_reliable_limit: %lu kB", task_reliable_limit >> 10); - pr_info("reliable_user_used: %lld kB", num << (PAGE_SHIFT - 10)); + num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + pr_info("ReliableFileCacheLimit: %lu kB\n", + reliable_pagecache_max_bytes >> 10); + pr_info("FileCache: %lu kB\n", num << (PAGE_SHIFT - 10)); + pr_info("ReliableFileCache: %llu kB\n", + nr_pagecache_pages << (PAGE_SHIFT - 10)); + } } void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, -- Gitee From 74ef98cdaa031ef0539a005b2cef3de31a3ebca5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 May 2022 12:14:42 +0000 Subject: [PATCH 288/340] x86/asm: Pin sensitive CR4 bits mainline inclusion from mainline-v5.3-rc1 commit 873d50d58f67ef15d2777b5e7f7a5268bb1fbae2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I563FS CVE: NA -------------------------------- Several recent exploits have used direct calls to the native_write_cr4() function to disable SMEP and SMAP before then continuing their exploits using userspace memory access. Direct calls of this form can be mitigate by pinning bits of CR4 so that they cannot be changed through a common function. This is not intended to be a general ROP protection (which would require CFI to defend against properly), but rather a way to avoid trivial direct function calling (or CFI bypasses via a matching function prototype) as seen in: https://googleprojectzero.blogspot.com/2017/05/exploiting-linux-kernel-via-packet.html (https://github.com/xairy/kernel-exploits/tree/master/CVE-2017-7308) The goals of this change: - Pin specific bits (SMEP, SMAP, and UMIP) when writing CR4. - Avoid setting the bits too early (they must become pinned only after CPU feature detection and selection has finished). - Pinning mask needs to be read-only during normal runtime. 
- Pinning needs to be checked after write to validate the cr4 state Using __ro_after_init on the mask is done so it can't be first disabled with a malicious write. Since these bits are global state (once established by the boot CPU and kernel boot parameters), they are safe to write to secondary CPUs before those CPUs have finished feature detection. As such, the bits are set at the first cr4 write, so that cr4 write bugs can be detected (instead of silently papered over). This uses a few bytes less storage of a location we don't have: read-only per-CPU data. A check is performed after the register write because an attack could just skip directly to the register write. Such a direct jump is possible because of how this function may be built by the compiler (especially due to the removal of frame pointers) where it doesn't add a stack frame (function exit may only be a retq without pops) which is sufficient for trivial exploitation like in the timer overwrites mentioned above). The asm argument constraints gain the "+" modifier to convince the compiler that it shouldn't make ordering assumptions about the arguments or memory, and treat them as changed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Dave Hansen Cc: kernel-hardening@lists.openwall.com Link: https://lkml.kernel.org/r/20190618045503.39105-3-keescook@chromium.org Signed-off-by: Lin Yujun Reviewed-by: Zhang Jianhua Signed-off-by: Yongqiang Liu --- arch/x86/include/asm/special_insns.h | 22 +++++++++++++++++++++- arch/x86/kernel/cpu/common.c | 20 ++++++++++++++++++++ arch/x86/kernel/smpboot.c | 8 +++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index b684dfdaaec1..ea8158e8fa4b 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -6,6 +6,8 @@ #ifdef __KERNEL__ #include +#include +#include /* * Volatile isn't enough to prevent the compiler from reordering the @@ -16,6 +18,10 @@ */ extern unsigned long __force_order; +/* Starts false and gets enabled once CPU feature detection is done. */ +DECLARE_STATIC_KEY_FALSE(cr_pinning); +extern unsigned long cr4_pinned_bits; + static inline unsigned long native_read_cr0(void) { unsigned long val; @@ -74,7 +80,21 @@ static inline unsigned long native_read_cr4(void) static inline void native_write_cr4(unsigned long val) { - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { + bits_missing = ~val & cr4_pinned_bits; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. 
*/ + WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", + bits_missing); + } } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6fe28a1b1867..a7d2fff95755 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -365,6 +365,25 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) cr4_clear_bits(X86_CR4_UMIP); } +DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); +EXPORT_SYMBOL(cr_pinning); +unsigned long cr4_pinned_bits __ro_after_init; +EXPORT_SYMBOL(cr4_pinned_bits); + +/* + * Once CPU feature detection is finished (and boot params have been + * parsed), record any of the sensitive CR bits that are set, and + * enable CR pinning. + */ +static void __init setup_cr_pinning(void) +{ + unsigned long mask; + + mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; + static_key_enable(&cr_pinning.key); +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1588,6 +1607,7 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); tsx_init(); + setup_cr_pinning(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 08b5f534fabb..23021970136e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -212,13 +212,19 @@ static int enable_start_cpu0; */ static void notrace start_secondary(void *unused) { + unsigned long cr4 = __read_cr4(); + /* * Don't put *anything* except direct CPU state initialization * before cpu_init(), SMP booting is too fragile that we want to * limit the things done here to the most necessary things. */ if (boot_cpu_has(X86_FEATURE_PCID)) - __write_cr4(__read_cr4() | X86_CR4_PCIDE); + cr4 |= X86_CR4_PCIDE; + if (static_branch_likely(&cr_pinning)) + cr4 |= cr4_pinned_bits; + + __write_cr4(cr4); #ifdef CONFIG_X86_32 /* switch away from the initial page table */ -- Gitee From 1143d1dd32d8f010bcd678a5f945b7e21f81d373 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 May 2022 12:14:43 +0000 Subject: [PATCH 289/340] x86/asm: Pin sensitive CR0 bits mainline inclusion from mainline-v5.3-rc1 commit 8dbec27a242cd3e2816eeb98d3237b9f57cf6232 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I563FS CVE: NA -------------------------------- With sensitive CR4 bits pinned now, it's possible that the WP bit for CR0 might become a target as well. Following the same reasoning for the CR4 pinning, pin CR0's WP bit. Contrary to the cpu feature dependend CR4 pinning this can be done with a constant value. 
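Both the CR4 hunk above and the CR0 hunk below follow the same pin-on-write pattern: write the register, and if a pinned bit came back cleared, set it again before warning. A condensed sketch of that pattern (register access and symbol names are simplified stand-ins, not the exact kernel code):

	static unsigned long pinned_bits;	/* established once during boot */

	void pinned_write(unsigned long val)
	{
		unsigned long bits_missing = 0;

	set_register:
		write_reg(val);			/* the actual mov to %cr0 / %cr4 */

		if (static_branch_likely(&cr_pinning)) {
			if (unlikely((val & pinned_bits) != pinned_bits)) {
				bits_missing = pinned_bits & ~val;
				val |= bits_missing;
				goto set_register;	/* restore the bits first */
			}
			/* Warn only after the missing bits have been set again. */
			WARN_ONCE(bits_missing, "pinned bits went missing: %lx!?\n",
				  bits_missing);
		}
	}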
Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Dave Hansen Cc: kernel-hardening@lists.openwall.com Link: https://lkml.kernel.org/r/20190618045503.39105-4-keescook@chromium.org Signed-off-by: Lin Yujun Reviewed-by: Zhang Jianhua Signed-off-by: Yongqiang Liu --- arch/x86/include/asm/special_insns.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index ea8158e8fa4b..87c371e87743 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -31,7 +31,20 @@ static inline unsigned long native_read_cr0(void) static inline void native_write_cr0(unsigned long val) { - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { + bits_missing = X86_CR0_WP; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); + } } static inline unsigned long native_read_cr2(void) -- Gitee From f1434c1db5ec10f3c931dcde9d71779e3f78f1dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 6 May 2022 12:14:44 +0000 Subject: [PATCH 290/340] x86/asm: Move native_write_cr0/4() out of line mainline inclusion from mainline-v5.3-rc1 commit 7652ac92018536eb807b6c2130100c85f1ba7e3b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I563FS CVE: NA -------------------------------- The pinning of sensitive CR0 and CR4 bits caused a boot crash when loading the kvm_intel module on a kernel compiled with CONFIG_PARAVIRT=n. The reason is that the static key which controls the pinning is marked RO after init. The kvm_intel module contains a CR4 write which requires to update the static key entry list. That obviously does not work when the key is in a RO section. With CONFIG_PARAVIRT enabled this does not happen because the CR4 write uses the paravirt indirection and the actual write function is built in. As the key is intended to be immutable after init, move native_write_cr0/4() out of line. While at it consolidate the update of the cr4 shadow variable and store the value right away when the pinning is initialized on a booting CPU. No point in reading it back 20 instructions later. This allows to confine the static key and the pinning variable to cpu/common and allows to mark them static. 
Fixes: 8dbec27a242c ("x86/asm: Pin sensitive CR0 bits") Fixes: 873d50d58f67 ("x86/asm: Pin sensitive CR4 bits") Reported-by: Linus Torvalds Reported-by: Xi Ruoyao Signed-off-by: Thomas Gleixner Tested-by: Xi Ruoyao Acked-by: Kees Cook Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907102140340.1758@nanos.tec.linutronix.de Signed-off-by: Lin Yujun Reviewed-by: Zhang Jianhua Signed-off-by: Yongqiang Liu --- arch/x86/include/asm/processor.h | 1 + arch/x86/include/asm/special_insns.h | 41 +--------------- arch/x86/kernel/cpu/common.c | 72 +++++++++++++++++++++------- arch/x86/kernel/smpboot.c | 14 +----- arch/x86/xen/smp_pv.c | 1 + 5 files changed, 61 insertions(+), 68 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e119c78581d1..307aa9454df3 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -763,6 +763,7 @@ extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); +extern void cr4_init(void); static inline unsigned long get_debugctlmsr(void) { diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 87c371e87743..af5f0d9a9951 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -18,9 +18,7 @@ */ extern unsigned long __force_order; -/* Starts false and gets enabled once CPU feature detection is done. */ -DECLARE_STATIC_KEY_FALSE(cr_pinning); -extern unsigned long cr4_pinned_bits; +void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { @@ -29,24 +27,6 @@ static inline unsigned long native_read_cr0(void) return val; } -static inline void native_write_cr0(unsigned long val) -{ - unsigned long bits_missing = 0; - -set_register: - asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); - - if (static_branch_likely(&cr_pinning)) { - if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { - bits_missing = X86_CR0_WP; - val |= bits_missing; - goto set_register; - } - /* Warn after we've set the missing bits. */ - WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); - } -} - static inline unsigned long native_read_cr2(void) { unsigned long val; @@ -91,24 +71,7 @@ static inline unsigned long native_read_cr4(void) return val; } -static inline void native_write_cr4(unsigned long val) -{ - unsigned long bits_missing = 0; - -set_register: - asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); - - if (static_branch_likely(&cr_pinning)) { - if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { - bits_missing = ~val & cr4_pinned_bits; - val |= bits_missing; - goto set_register; - } - /* Warn after we've set the missing bits. 
*/ - WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", - bits_missing); - } -} +void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_64 static inline unsigned long native_read_cr8(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a7d2fff95755..a0306327e7ed 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -365,10 +365,62 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) cr4_clear_bits(X86_CR4_UMIP); } -DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); -EXPORT_SYMBOL(cr_pinning); -unsigned long cr4_pinned_bits __ro_after_init; -EXPORT_SYMBOL(cr4_pinned_bits); +static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); +static unsigned long cr4_pinned_bits __ro_after_init; + +void native_write_cr0(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { + bits_missing = X86_CR0_WP; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); + } +} +EXPORT_SYMBOL(native_write_cr0); + +void native_write_cr4(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { + bits_missing = ~val & cr4_pinned_bits; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", + bits_missing); + } +} +EXPORT_SYMBOL(native_write_cr4); + +void cr4_init(void) +{ + unsigned long cr4 = __read_cr4(); + + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4 |= X86_CR4_PCIDE; + if (static_branch_likely(&cr_pinning)) + cr4 |= cr4_pinned_bits; + + __write_cr4(cr4); + + /* Initialize cr4 shadow for this CPU. */ + this_cpu_write(cpu_tlbstate.cr4, cr4); +} /* * Once CPU feature detection is finished (and boot params have been @@ -1851,12 +1903,6 @@ void cpu_init(void) wait_for_master_cpu(cpu); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. - */ - cr4_init_shadow(); - if (cpu) load_ucode_ap(); @@ -1956,12 +2002,6 @@ void cpu_init(void) wait_for_master_cpu(cpu); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. - */ - cr4_init_shadow(); - show_ucode_info_early(); pr_info("Initializing CPU#%d\n", cpu); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 23021970136e..ee697fa8847d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -212,28 +212,16 @@ static int enable_start_cpu0; */ static void notrace start_secondary(void *unused) { - unsigned long cr4 = __read_cr4(); - /* * Don't put *anything* except direct CPU state initialization * before cpu_init(), SMP booting is too fragile that we want to * limit the things done here to the most necessary things. */ - if (boot_cpu_has(X86_FEATURE_PCID)) - cr4 |= X86_CR4_PCIDE; - if (static_branch_likely(&cr_pinning)) - cr4 |= cr4_pinned_bits; - - __write_cr4(cr4); + cr4_init(); #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. 
- */ - cr4_init_shadow(); __flush_tlb_all(); #endif load_current_idt(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index e3b18ad49889..32a9c2212124 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -57,6 +57,7 @@ static void cpu_bringup(void) { int cpu; + cr4_init(); cpu_init(); touch_softlockup_watchdog(); preempt_disable(); -- Gitee From d96ca30cacc3c08345ecd0c95a9d237c86a9ee77 Mon Sep 17 00:00:00 2001 From: david regan Date: Fri, 6 May 2022 12:14:45 +0000 Subject: [PATCH 291/340] mtd: rawnand: brcmnand: Fixed incorrect sub-page ECC status stable inclusion from stable-v4.19.231 commit c48771b7f7b086bca5a1094191b33c3c4324bf0b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: N/A ------------------------------------------------ [ Upstream commit 36415a7964711822e63695ea67fede63979054d9 ] The brcmnand driver contains a bug in which if a page (example 2k byte) is read from the parallel/ONFI NAND and within that page a subpage (512 byte) has correctable errors which is followed by a subpage with uncorrectable errors, the page read will return the wrong status of correctable (as opposed to the actual status of uncorrectable.) The bug is in function brcmnand_read_by_pio where there is a check for uncorrectable bits which will be preempted if a previous status for correctable bits is detected. The fix is to stop checking for bad bits only if we already have a bad bits status. Fixes: 27c5b17cd1b1 ("mtd: nand: add NAND driver "library" for Broadcom STB NAND controller") Signed-off-by: david regan Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/trinity-478e0c09-9134-40e8-8f8c-31c371225eda-1643237024774@3c-app-mailcom-lxa02 Signed-off-by: Sasha Levin Conflicts: drivers/mtd/nand/raw/brcmnand/brcmnand.c Signed-off-by: Cui GaoSheng Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/mtd/nand/raw/brcmnand/brcmnand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c index 4b90d5b380c2..6fbca4ae1113 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c @@ -1633,7 +1633,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip, mtd->oobsize / trans, host->hwcfg.sector_size_1k); - if (!ret) { + if (ret != -EBADMSG) { *err_addr = brcmnand_read_reg(ctrl, BRCMNAND_UNCORR_ADDR) | ((u64)(brcmnand_read_reg(ctrl, -- Gitee From 7959a47042a6c1ec5a97966cf3c04ee972b55348 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Sat, 7 May 2022 08:24:21 +0000 Subject: [PATCH 292/340] cifs: fix double free race when mount fails in cifs_get_root() stable inclusion from linux-4.19.233 commit 2fe0e281f7ad0a62259649764228227dd6b2561d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- [ Upstream commit 3d6cc9898efdfb062efb74dc18cfc700e082f5d5 ] When cifs_get_root() fails during cifs_smb3_do_mount() we call deactivate_locked_super() which eventually will call delayed_free() which will free the context. In this situation we should not proceed to enter the out: section in cifs_smb3_do_mount() and free the same resources a second time. 
[Thu Feb 10 12:59:06 2022] BUG: KASAN: use-after-free in rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] Read of size 8 at addr ffff888364f4d110 by task swapper/1/0 [Thu Feb 10 12:59:06 2022] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G OE 5.17.0-rc3+ #4 [Thu Feb 10 12:59:06 2022] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 [Thu Feb 10 12:59:06 2022] Call Trace: [Thu Feb 10 12:59:06 2022] [Thu Feb 10 12:59:06 2022] dump_stack_lvl+0x5d/0x78 [Thu Feb 10 12:59:06 2022] print_address_description.constprop.0+0x24/0x150 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] kasan_report.cold+0x7d/0x117 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] __asan_load8+0x86/0xa0 [Thu Feb 10 12:59:06 2022] rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] rcu_core+0x547/0xca0 [Thu Feb 10 12:59:06 2022] ? call_rcu+0x3c0/0x3c0 [Thu Feb 10 12:59:06 2022] ? __this_cpu_preempt_check+0x13/0x20 [Thu Feb 10 12:59:06 2022] ? lock_is_held_type+0xea/0x140 [Thu Feb 10 12:59:06 2022] rcu_core_si+0xe/0x10 [Thu Feb 10 12:59:06 2022] __do_softirq+0x1d4/0x67b [Thu Feb 10 12:59:06 2022] __irq_exit_rcu+0x100/0x150 [Thu Feb 10 12:59:06 2022] irq_exit_rcu+0xe/0x30 [Thu Feb 10 12:59:06 2022] sysvec_hyperv_stimer0+0x9d/0xc0 ... [Thu Feb 10 12:59:07 2022] Freed by task 58179: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] kasan_set_track+0x25/0x30 [Thu Feb 10 12:59:07 2022] kasan_set_free_info+0x24/0x40 [Thu Feb 10 12:59:07 2022] ____kasan_slab_free+0x137/0x170 [Thu Feb 10 12:59:07 2022] __kasan_slab_free+0x12/0x20 [Thu Feb 10 12:59:07 2022] slab_free_freelist_hook+0xb3/0x1d0 [Thu Feb 10 12:59:07 2022] kfree+0xcd/0x520 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0x149/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae [Thu Feb 10 12:59:07 2022] Last potentially related work creation: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] __kasan_record_aux_stack+0xb6/0xc0 [Thu Feb 10 12:59:07 2022] kasan_record_aux_stack_noalloc+0xb/0x10 [Thu Feb 10 12:59:07 2022] call_rcu+0x76/0x3c0 [Thu Feb 10 12:59:07 2022] cifs_umount+0xce/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] cifs_kill_sb+0xc8/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] deactivate_locked_super+0x5d/0xd0 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0xab9/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: Shyam Prasad N Reviewed-by: Shyam Prasad N Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- fs/cifs/cifsfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index bc906fcf3f6d..baa1713d6695 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -779,6 +779,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, out_super: deactivate_locked_super(sb); + return root; 
out: cifs_cleanup_volume_info(volume_info); return root; -- Gitee From 52dfb991a57677751d6e108539e1bafb67370db9 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Sat, 7 May 2022 08:24:22 +0000 Subject: [PATCH 293/340] xfrm: fix MTU regression stable inclusion from linux-4.19.233 commit 20fe64c54412ca42efb9ba07f856ed8cac6e007e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit 6596a0229541270fb8d38d989f91b78838e5e9da upstream. Commit 749439bfac6e1a2932c582e2699f91d329658196 ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") breaks PMTU for xfrm. A Packet Too Big ICMPv6 message received in response to an ESP packet will prevent all further communication through the tunnel if the reported MTU minus the ESP overhead is smaller than 1280. E.g. in a case of a tunnel-mode ESP with sha256/aes the overhead is 92 bytes. Receiving a PTB with MTU of 1371 or less will result in all further packets in the tunnel dropped. A ping through the tunnel fails with "ping: sendmsg: Invalid argument". Apparently the MTU on the xfrm route is smaller than 1280 and fails the check inside ip6_setup_cork() added by 749439bf. We found this by debugging USGv6/ipv6ready failures. Failing tests are: "Phase-2 Interoperability Test Scenario IPsec" / 5.3.11 and 5.4.11 (Tunnel Mode: Fragmentation). Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") attempted to fix this but caused another regression in TCP MSS calculations and had to be reverted. The patch below fixes the situation by dropping the MTU check and instead checking for the underflows described in the 749439bf commit message. Signed-off-by: Jiri Bohac Fixes: 749439bfac6e ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/ipv6/ip6_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 251ec12517e9..fd496bda7ec7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1259,8 +1259,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (np->frag_size) mtu = np->frag_size; } - if (mtu < IPV6_MIN_MTU) - return -EINVAL; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; @@ -1320,8 +1318,6 @@ static int __ip6_append_data(struct sock *sk, fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? 
opt->opt_flen + opt->opt_nflen : 0) + @@ -1329,6 +1325,13 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + if (mtu < fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr)) + goto emsgsize; + + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ -- Gitee From 2d3caf2452989c9efa16f6ce6dc4b7c48d026d24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 May 2022 08:24:23 +0000 Subject: [PATCH 294/340] netfilter: fix use-after-free in __nf_register_net_hook() stable inclusion from linux-4.19.233 commit bdd8fc1b826e6f23963f5bef3f7431c6188ec954 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit 56763f12b0f02706576a088e85ef856deacc98a0 upstream. We must not dereference @new_hooks after nf_hook_mutex has been released, because other threads might have freed our allocated hooks already. BUG: KASAN: use-after-free in nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] BUG: KASAN: use-after-free in hooks_validate net/netfilter/core.c:171 [inline] BUG: KASAN: use-after-free in __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 Read of size 2 at addr ffff88801c1a8000 by task syz-executor237/4430 CPU: 1 PID: 4430 Comm: syz-executor237 Not tainted 5.17.0-rc5-syzkaller-00306-g2293be58d6a1 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] hooks_validate net/netfilter/core.c:171 [inline] __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 ipv6_setsockopt+0x122/0x180 net/ipv6/ipv6_sockglue.c:1024 rawv6_setsockopt+0xd3/0x6a0 net/ipv6/raw.c:1084 __sys_setsockopt+0x2db/0x610 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f65a1ace7d9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 71 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f65a1a7f308 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f65a1ace7d9 RDX: 
0000000000000040 RSI: 0000000000000029 RDI: 0000000000000003 RBP: 00007f65a1b574c8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 00007f65a1b55130 R13: 00007f65a1b574c0 R14: 00007f65a1b24090 R15: 0000000000022000 The buggy address belongs to the page: page:ffffea0000706a00 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1c1a8 flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000000 ffffea0001c1b108 ffffea000046dd08 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as freed page last allocated via order 2, migratetype Unmovable, gfp_mask 0x52dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_ZERO), pid 4430, ts 1061781545818, free_ts 1061791488993 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 __alloc_pages_node include/linux/gfp.h:572 [inline] alloc_pages_node include/linux/gfp.h:595 [inline] kmalloc_large_node+0x62/0x130 mm/slub.c:4438 __kmalloc_node+0x35a/0x4a0 mm/slub.c:4454 kmalloc_node include/linux/slab.h:604 [inline] kvmalloc_node+0x97/0x100 mm/util.c:580 kvmalloc include/linux/slab.h:731 [inline] kvzalloc include/linux/slab.h:739 [inline] allocate_hook_entries_size net/netfilter/core.c:61 [inline] nf_hook_entries_grow+0x140/0x780 net/netfilter/core.c:128 __nf_register_net_hook+0x144/0x820 net/netfilter/core.c:429 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 kvfree+0x42/0x50 mm/util.c:613 rcu_do_batch kernel/rcu/tree.c:2527 [inline] rcu_core+0x7b1/0x1820 kernel/rcu/tree.c:2778 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Memory state around the buggy address: ffff88801c1a7f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a7f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88801c1a8000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff88801c1a8080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a8100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 2420b79f8c18 ("netfilter: debug: check for sorted array") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/netfilter/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3f0bdc728f89..ada3163b3fa3 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -335,14 +335,15 @@ 
static int __nf_register_net_hook(struct net *net, int pf, p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); - if (!IS_ERR(new_hooks)) + if (!IS_ERR(new_hooks)) { + hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); + } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); - hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); -- Gitee From 8db3cf08ad706f3c2a5d37ddcaa069b1d65da584 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Sat, 7 May 2022 08:24:24 +0000 Subject: [PATCH 295/340] xfrm: fix the if_id check in changelink stable inclusion from linux-4.19.233 commit 33c4a43021acfb91e289b2d92e2db0f4a8fcf41e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit 6d0d95a1c2b07270870e7be16575c513c29af3f1 upstream. if_id will be always 0, because it was not yet initialized. Fixes: 8dce43919566 ("xfrm: interface with if_id 0 should return error") Reported-by: Pavel Machek Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/xfrm/xfrm_interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 6bfd7b8249da..18d27ce26600 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -698,12 +698,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = xi->net; struct xfrm_if_parms p = {}; + xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } - xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); -- Gitee From 48c4b4b58ff6bea73210f19e6de25b0e75fbf702 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 7 May 2022 08:24:25 +0000 Subject: [PATCH 296/340] xfrm: enforce validity of offload input flags stable inclusion from linux-4.19.233 commit 94c115d1380400f0001640619244dfe7a3a04772 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit 7c76ecd9c99b6e9a771d813ab1aa7fa428b3ade1 upstream. struct xfrm_user_offload has flags variable that received user input, but kernel didn't check if valid bits were provided. It caused a situation where not sanitized input was forwarded directly to the drivers. For example, XFRM_OFFLOAD_IPV6 define that was exposed, was used by strongswan, but not implemented in the kernel at all. As a solution, check and sanitize input flags to forward XFRM_OFFLOAD_INBOUND to the drivers. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 5f3b9fec7b5f..57d7d5a99c9d 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -504,6 +504,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. + * Unfortunately, strongswan has the code that uses sets this flag, + * which makes impossible to reuse this bit. 
+ * + * So leave it here to make sure that it won't be reused by mistake. + */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index e7a0ce98479f..8a9f02997067 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -153,6 +153,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -190,7 +193,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { -- Gitee From 7078f0dd20c2d448c6af3c2451eefb0b84070e25 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 7 May 2022 08:24:26 +0000 Subject: [PATCH 297/340] netfilter: nf_queue: don't assume sk is full socket stable inclusion from linux-4.19.233 commit 6d9719ef61a99d1ed750b510d32620a6c4bb7397 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit 747670fd9a2d1b7774030dba65ca022ba442ce71 upstream. There is no guarantee that state->sk refers to a full socket. If refcount transitions to 0, sock_put calls sk_free which then ends up with garbage fields. I'd like to thank Oleksandr Natalenko and Jiri Benc for considerable debug work and pointing out state->sk oddities. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Tested-by: Oleksandr Natalenko Signed-off-by: Florian Westphal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/netfilter/nf_queue.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index ee6d98355081..ca0c9a9dcdc8 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,15 @@ void nf_unregister_queue_handler(struct net *net) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_sock_put(struct sock *sk) +{ +#ifdef CONFIG_INET + sock_gen_put(sk); +#else + sock_put(sk); +#endif +} + void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -56,7 +65,7 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) if (state->out) dev_put(state->out); if (state->sk) - sock_put(state->sk); + nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct net_device *physdev; -- Gitee From 856bec9c04cc6ad11c5bbcace8ff6fe1bac5412e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 7 May 2022 08:24:27 +0000 Subject: [PATCH 298/340] netfilter: nf_queue: fix possible use-after-free stable inclusion from linux-4.19.233 commit 34dc4a6a7f261736ef7183868a5bddad31c7f9e3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit c3873070247d9e3c7a6b0cf9bf9b45e8018427b1 upstream. Eric Dumazet says: The sock_hold() side seems suspect, because there is no guarantee that sk_refcnt is not already 0. On failure, we cannot queue the packet and need to indicate an error. The packet will be dropped by the caller. 
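For illustration only, a minimal sketch of the pattern the fix below adopts in nf_queue_entry_get_refs(): the socket reference is taken conditionally, and a false return tells the caller to free the entry and drop the packet instead of queueing it.

        /* Illustrative sketch: keep state->sk only while it still has users */
        if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
                return false;   /* caller drops the packet */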
v2: split skb prefetch hunk into separate change Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.") Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- include/net/netfilter/nf_queue.h | 2 +- net/netfilter/nf_queue.c | 11 +++++++++-- net/netfilter/nfnetlink_queue.c | 12 +++++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index a50a69f5334c..861414a62ce1 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -32,7 +32,7 @@ void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *q void nf_unregister_queue_handler(struct net *net); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_release_refs(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index ca0c9a9dcdc8..84c59de27882 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -82,10 +82,13 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + if (state->in) dev_hold(state->in); if (state->out) @@ -104,6 +107,7 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(physdev); } #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -195,7 +199,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, .size = sizeof(*entry) + route_key_size, }; - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8955431f2ab2..a5aff2834bd6 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -716,9 +716,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -- Gitee From e044091614ea835e89ae4e954c2b4538b3e05837 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 7 May 2022 08:24:28 +0000 Subject: [PATCH 299/340] PCI: pciehp: Fix infinite loop in IRQ handler upon power fault stable inclusion from linux-4.19.233 commit ff27f7d0333cff89ec85c419f431aca1b38fb16a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit 23584c1ed3e15a6f4bfab8dc5a88d94ab929ee12 upstream. The Power Fault Detected bit in the Slot Status register differs from all other hotplug events in that it is sticky: It can only be cleared after turning off slot power. Per PCIe r5.0, sec. 
6.7.1.8: If a power controller detects a main power fault on the hot-plug slot, it must automatically set its internal main power fault latch [...]. The main power fault latch is cleared when software turns off power to the hot-plug slot. The stickiness used to cause interrupt storms and infinite loops which were fixed in 2009 by commits 5651c48cfafe ("PCI pciehp: fix power fault interrupt storm problem") and 99f0169c17f3 ("PCI: pciehp: enable software notification on empty slots"). Unfortunately in 2020 the infinite loop issue was inadvertently reintroduced by commit 8edf5332c393 ("PCI: pciehp: Fix MSI interrupt race"): The hardirq handler pciehp_isr() clears the PFD bit until pciehp's power_fault_detected flag is set. That happens in the IRQ thread pciehp_ist(), which never learns of the event because the hardirq handler is stuck in an infinite loop. Fix by setting the power_fault_detected flag already in the hardirq handler. Link: https://bugzilla.kernel.org/show_bug.cgi?id=214989 Link: https://lore.kernel.org/linux-pci/DM8PR11MB5702255A6A92F735D90A4446868B9@DM8PR11MB5702.namprd11.prod.outlook.com Fixes: 8edf5332c393 ("PCI: pciehp: Fix MSI interrupt race") Link: https://lore.kernel.org/r/66eaeef31d4997ceea357ad93259f290ededecfd.1637187226.git.lukas@wunner.de Reported-by: Joseph Bao Tested-by: Joseph Bao Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v4.19+ Cc: Stuart Hayes [sudip: adjust context] Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- drivers/pci/hotplug/pciehp_hpc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index c49d1fc86cc1..ac552a85a9b6 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -609,6 +609,8 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) */ if (ctrl->power_fault_detected) status &= ~PCI_EXP_SLTSTA_PFD; + else if (status & PCI_EXP_SLTSTA_PFD) + ctrl->power_fault_detected = true; events |= status; if (!events) { @@ -618,7 +620,7 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) } if (status) { - pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events); + pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, status); /* * In MSI mode, all event bits must be zero before the port @@ -706,8 +708,7 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) } /* Check Power Fault Detected */ - if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) { - ctrl->power_fault_detected = 1; + if (events & PCI_EXP_SLTSTA_PFD) { ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(slot)); pciehp_set_attention_status(slot, 1); pciehp_green_led_off(slot); -- Gitee From 3322f20f332cef0d4d5d3478047718e77e2b8c08 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 7 May 2022 08:24:29 +0000 Subject: [PATCH 300/340] memfd: fix F_SEAL_WRITE after shmem huge page allocated stable inclusion from linux-4.19.233 commit 463c2accd61fe8c664dfcb9d9de50f632363ad28 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit f2b277c4d1c63a85127e8aa2588e9cc3bd21cb99 upstream. 
Wangyong reports: after enabling tmpfs filesystem to support transparent hugepage with the following command: echo always > /sys/kernel/mm/transparent_hugepage/shmem_enabled the docker program tries to add F_SEAL_WRITE through the following command, but it fails unexpectedly with errno EBUSY: fcntl(5, F_ADD_SEALS, F_SEAL_WRITE) = -1. That is because memfd_tag_pins() and memfd_wait_for_pins() were never updated for shmem huge pages: checking page_mapcount() against page_count() is hopeless on THP subpages - they need to check total_mapcount() against page_count() on THP heads only. Make memfd_tag_pins() (compared > 1) as strict as memfd_wait_for_pins() (compared != 1): either can be justified, but given the non-atomic total_mapcount() calculation, it is better now to be strict. Bear in mind that total_mapcount() itself scans all of the THP subpages, when choosing to take an XA_CHECK_SCHED latency break. Also fix the unlikely xa_is_value() case in memfd_wait_for_pins(): if a page has been swapped out since memfd_tag_pins(), then its refcount must have fallen, and so it can safely be untagged. Link: https://lkml.kernel.org/r/a4f79248-df75-2c8c-3df-ba3317ccb5da@google.com Signed-off-by: Hugh Dickins Reported-by: Zeal Robot Reported-by: wangyong Cc: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: CGEL ZTE Cc: Kirill A. Shutemov Cc: Song Liu Cc: Yang Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- mm/memfd.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 9e68a4320a0e..2d19288a093e 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -34,26 +34,35 @@ static void memfd_tag_pins(struct address_space *mapping) void __rcu **slot; pgoff_t start; struct page *page; - unsigned int tagged = 0; + int latency = 0; + int cache_count; lru_add_drain(); start = 0; xa_lock_irq(&mapping->i_pages); radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + cache_count = 1; page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (!page || radix_tree_exception(page)) { + if (!page || radix_tree_exception(page) || PageTail(page)) { if (radix_tree_deref_retry(page)) { slot = radix_tree_iter_retry(&iter); continue; } - } else if (page_count(page) - page_mapcount(page) > 1) { - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); + } else { + if (PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + if (cache_count != + page_count(page) - total_mapcount(page)) { + radix_tree_tag_set(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + } } - if (++tagged % 1024) + latency += cache_count; + if (latency < 1024) continue; + latency = 0; slot = radix_tree_iter_resume(slot, &iter); xa_unlock_irq(&mapping->i_pages); @@ -79,6 +88,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) pgoff_t start; struct page *page; int error, scan; + int cache_count; memfd_tag_pins(mapping); @@ -107,8 +117,12 @@ static int memfd_wait_for_pins(struct address_space *mapping) page = NULL; } - if (page && - page_count(page) - page_mapcount(page) != 1) { + cache_count = 1; + if (page && PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (page && cache_count != + page_count(page) - total_mapcount(page)) { if (scan < LAST_SCAN) goto continue_resched; -- Gitee From 7be896bfa0f4688239f4f85cf9913cfda1b537bc Mon Sep 17 00:00:00 2001 From: suresh kumar Date: Sat, 7 May 2022 08:24:30 +0000 
Subject: [PATCH 301/340] net-sysfs: add check for netdevice being present to speed_show stable inclusion from linux-4.19.235 commit 75fc8363227a999e8f3d17e2eb28dce5600dcd3f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- [ Upstream commit 4224cfd7fb6523f7a9d1c8bb91bb5df1e38eb624 ] When bringing down the netdevice or system shutdown, a panic can be triggered while accessing the sysfs path because the device is already removed. [ 755.549084] mlx5_core 0000:12:00.1: Shutdown was called [ 756.404455] mlx5_core 0000:12:00.0: Shutdown was called ... [ 757.937260] BUG: unable to handle kernel NULL pointer dereference at (null) [ 758.031397] IP: [] dma_pool_alloc+0x1ab/0x280 crash> bt ... PID: 12649 TASK: ffff8924108f2100 CPU: 1 COMMAND: "amsd" ... #9 [ffff89240e1a38b0] page_fault at ffffffff8f38c778 [exception RIP: dma_pool_alloc+0x1ab] RIP: ffffffff8ee11acb RSP: ffff89240e1a3968 RFLAGS: 00010046 RAX: 0000000000000246 RBX: ffff89243d874100 RCX: 0000000000001000 RDX: 0000000000000000 RSI: 0000000000000246 RDI: ffff89243d874090 RBP: ffff89240e1a39c0 R8: 000000000001f080 R9: ffff8905ffc03c00 R10: ffffffffc04680d4 R11: ffffffff8edde9fd R12: 00000000000080d0 R13: ffff89243d874090 R14: ffff89243d874080 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #10 [ffff89240e1a39c8] mlx5_alloc_cmd_msg at ffffffffc04680f3 [mlx5_core] #11 [ffff89240e1a3a18] cmd_exec at ffffffffc046ad62 [mlx5_core] #12 [ffff89240e1a3ab8] mlx5_cmd_exec at ffffffffc046b4fb [mlx5_core] #13 [ffff89240e1a3ae8] mlx5_core_access_reg at ffffffffc0475434 [mlx5_core] #14 [ffff89240e1a3b40] mlx5e_get_fec_caps at ffffffffc04a7348 [mlx5_core] #15 [ffff89240e1a3bb0] get_fec_supported_advertised at ffffffffc04992bf [mlx5_core] #16 [ffff89240e1a3c08] mlx5e_get_link_ksettings at ffffffffc049ab36 [mlx5_core] #17 [ffff89240e1a3ce8] __ethtool_get_link_ksettings at ffffffff8f25db46 #18 [ffff89240e1a3d48] speed_show at ffffffff8f277208 #19 [ffff89240e1a3dd8] dev_attr_show at ffffffff8f0b70e3 #20 [ffff89240e1a3df8] sysfs_kf_seq_show at ffffffff8eedbedf #21 [ffff89240e1a3e18] kernfs_seq_show at ffffffff8eeda596 #22 [ffff89240e1a3e28] seq_read at ffffffff8ee76d10 #23 [ffff89240e1a3e98] kernfs_fop_read at ffffffff8eedaef5 #24 [ffff89240e1a3ed8] vfs_read at ffffffff8ee4e3ff #25 [ffff89240e1a3f08] sys_read at ffffffff8ee4f27f #26 [ffff89240e1a3f50] system_call_fastpath at ffffffff8f395f92 crash> net_device.state ffff89443b0c0000 state = 0x5 (__LINK_STATE_START| __LINK_STATE_NOCARRIER) To prevent this scenario, we also make sure that the netdevice is present. Signed-off-by: suresh kumar Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/core/net-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e5dc04cb5599..7a11b2d90975 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -203,7 +203,7 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev)) { + if (netif_running(netdev) && netif_device_present(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) -- Gitee From fb435f3bc51428a9e78694770ba3c9818d8ba3a3 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 7 May 2022 08:24:31 +0000 Subject: [PATCH 302/340] ext4: add check to prevent attempting to resize an fs with sparse_super2 stable inclusion from linux-4.19.235 commit 056d829499d3e9b5bb2c4e9a1d96d4e4f77b5ae2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit b1489186cc8391e0c1e342f9fbc3eedf6b944c61 upstream. The in-kernel ext4 resize code doesn't support filesystem with the sparse_super2 feature. It fails with errors like this and doesn't finish the resize: EXT4-fs (loop0): resizing filesystem from 16640 to 7864320 blocks EXT4-fs warning (device loop0): verify_reserved_gdb:760: reserved GDT 2 missing grp 1 (32770) EXT4-fs warning (device loop0): ext4_resize_fs:2111: error (-22) occurred during file system resize EXT4-fs (loop0): resized filesystem to 2097152 To reproduce: mkfs.ext4 -b 4096 -I 256 -J size=32 -E resize=$((256*1024*1024)) -O sparse_super2 ext4.img 65M truncate -s 30G ext4.img mount ext4.img /mnt python3 -c 'import fcntl, os, struct ; fd = os.open("/mnt", os.O_RDONLY | os.O_DIRECTORY) ; fcntl.ioctl(fd, 0x40086610, struct.pack("Q", 30 * 1024 * 1024 * 1024 // 4096), False) ; os.close(fd)' dmesg | tail e2fsck ext4.img The userspace resize2fs tool has a check for this case: it checks if the filesystem has sparse_super2 set and if the kernel provides /sys/fs/ext4/features/sparse_super2. However, the former check requires manually reading and parsing the filesystem superblock. Detect this case in ext4_resize_begin and error out early with a clear error message. Signed-off-by: Josh Triplett Link: https://lore.kernel.org/r/74b8ae78405270211943cd7393e65586c5faeed1.1623093259.git.josh@joshtriplett.org Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- fs/ext4/resize.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 08d67f79aed7..7441e28d2706 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -74,6 +74,11 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; } + if (ext4_has_feature_sparse_super2(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); + return -EOPNOTSUPP; + } + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags)) ret = -EBUSY; -- Gitee From 8fca9e9d82611f9d1d85d63a8685038b79d1a1a8 Mon Sep 17 00:00:00 2001 From: Kai Lueke Date: Sat, 7 May 2022 08:24:32 +0000 Subject: [PATCH 303/340] Revert "xfrm: state and policy should fail if XFRMA_IF_ID 0" stable inclusion from linux-4.19.236 commit c8c9220cc0fb0dcdcce140533cc46128bd836347 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit a3d9001b4e287fc043e5539d03d71a32ab114bcb upstream. 
This reverts commit 68ac0f3810e76a853b5f7b90601a05c3048b8b54 because ID 0 was meant to be used for configuring the policy/state without matching for a specific interface (e.g., Cilium is affected, see https://github.com/cilium/cilium/pull/18789 and https://github.com/cilium/cilium/pull/19019). Signed-off-by: Kai Lueke Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/xfrm/xfrm_user.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8d8f9e778cd4..87932f6ad9d7 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -620,13 +620,8 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!x->if_id) { - err = -EINVAL; - goto error; - } - } err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -1332,13 +1327,8 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, mark = xfrm_mark_get(attrs, &m); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!if_id) { - err = -EINVAL; - goto out_noput; - } - } if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); @@ -1640,13 +1630,8 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!xp->if_id) { - err = -EINVAL; - goto error; - } - } return xp; error: -- Gitee From 504cf1ffe1b3023fff5554e92b671655d6ed9c85 Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Sat, 7 May 2022 08:24:33 +0000 Subject: [PATCH 304/340] xfrm: Fix xfrm migrate issues when address family changes stable inclusion from linux-4.19.236 commit f82fdf0230c3ea639306f35daaac7fca11a48a7a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- [ Upstream commit e03c3bba351f99ad932e8f06baa9da1afc418e02 ] xfrm_migrate cannot handle address family change of an xfrm_state. The symptons are the xfrm_state will be migrated to a wrong address, and sending as well as receiving packets wil be broken. This commit fixes it by breaking the original xfrm_state_clone method into two steps so as to update the props.family before running xfrm_init_state. As the result, xfrm_state's inner mode, outer mode, type and IP header length in xfrm_state_migrate can be updated with the new address family. 
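A minimal sketch of the reordering described above, for illustration (the exact hunks are in the diff below): the clone's address family is taken from the migrate request before state initialisation runs, so the inner/outer mode, type and header length are derived from the new family.

        xc->props.family = m->new_family;

        if (xfrm_init_state(xc) < 0)
                goto error;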
Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1885354 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/xfrm/xfrm_state.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 84dea0ad1666..bc7e982a86d1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1443,9 +1443,6 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); - if (xfrm_init_state(x) < 0) - goto error; - x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; @@ -1524,6 +1521,11 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, if (!xc) return NULL; + xc->props.family = m->new_family; + + if (xfrm_init_state(xc) < 0) + goto error; + memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); -- Gitee From 5d2ba004b3817cb09f9168331752b448dad89af1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 May 2022 08:24:34 +0000 Subject: [PATCH 305/340] tcp: make tcp_read_sock() more robust stable inclusion from linux-4.19.236 commit 9c90c0c62b5b5878f32400338983878a5345b050 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- [ Upstream commit e3d5ea2c011ecb16fb94c56a659364e6b30fac94 ] If recv_actor() returns an incorrect value, tcp_read_sock() might loop forever. Instead, issue a one time warning and make sure to make progress. Signed-off-by: Eric Dumazet Acked-by: John Fastabend Acked-by: Jakub Sitnicki Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220302161723.3910001-2-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5b410171a20b..6d955ee09d1c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1674,11 +1674,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it -- Gitee From 9fcc65d0f0b0e35e34e432f9a43756040523035c Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Sat, 7 May 2022 08:24:35 +0000 Subject: [PATCH 306/340] cpuset: Fix unsafe lock order between cpuset lock and cpuslock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from linux-4.19.236 commit aa44002e7db25f333ddf412fb81e8db6c100841a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- The backport commit 4eec5fe1c680a ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug") looks suspicious since it comes before commit d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order") v5.4-rc1~176^2~30 when the locking order was: cpuset lock, cpus lock. 
Fix it with the correct locking order and reduce the cpus locking range because only set_cpus_allowed_ptr() needs the protection of cpus lock. Fixes: 4eec5fe1c680a ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug") Reported-by: Michal Koutný Signed-off-by: Zhang Qiao Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- kernel/cgroup/cpuset.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27dd33566df8..3544ee391350 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1530,9 +1530,13 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - cpus_read_lock(); mutex_lock(&cpuset_mutex); + /* + * It should hold cpus lock because a cpu offline event can + * cause set_cpus_allowed_ptr() failed. + */ + get_online_cpus(); /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1551,6 +1555,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); } + put_online_cpus(); /* * Change mm for all threadgroup leaders. This is expensive and may @@ -1586,7 +1591,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ -- Gitee From 0d845d504d68ed6e60bc353cccedc4facedd56c3 Mon Sep 17 00:00:00 2001 From: liqiong Date: Sat, 7 May 2022 08:24:36 +0000 Subject: [PATCH 307/340] mm: fix dereference a null pointer in migrate[_huge]_page_move_mapping() stable inclusion from linux-4.19.236 commit deebce9df9ffaa62613bfcd8351d0c43a9a66108 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- Upstream doesn't use the radix tree any more in migrate.c, so this patch is not needed there. The two functions look up a slot and dereference the pointer. If the pointer is null, the kernel crashes. The 'numad' service calls 'migrate_pages' periodically. If some slots are being replaced (Cache Eviction), radix_tree_lookup_slot() returns a null pointer that causes a kernel crash. "numad": crash> bt [exception RIP: migrate_page_move_mapping+337] Introduce pointer checking to avoid dereferencing a null pointer.
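Illustrative sketch of the check added at both lookup sites (see the diff below): a slot that has gone away is treated as a transient condition and reported as -EAGAIN rather than dereferenced.

        pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
        if (pslot == NULL) {
                xa_unlock_irq(&mapping->i_pages);
                return -EAGAIN;
        }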
Cc: # linux-4.19.y Signed-off-by: liqiong Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- mm/migrate.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index ecfa8829acfd..3a20900e19e0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -473,6 +473,10 @@ int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + if (pslot == NULL) { + xa_unlock_irq(&mapping->i_pages); + return -EAGAIN; + } expected_count += hpage_nr_pages(page) + page_has_private(page); if (page_count(page) != expected_count || @@ -596,6 +600,10 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xa_lock_irq(&mapping->i_pages); pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + if (pslot == NULL) { + xa_unlock_irq(&mapping->i_pages); + return -EAGAIN; + } expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || -- Gitee From 4c3790eb3c7e9f0fcc9a3a29d14afaf50bead9d7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 May 2022 08:24:37 +0000 Subject: [PATCH 308/340] net/packet: fix slab-out-of-bounds access in packet_recvmsg() stable inclusion from linux-4.19.236 commit a33dd1e6693f80d805155b3f69c18c2f642915da category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- [ Upstream commit c700525fcc06b05adfea78039de02628af79e07a ] syzbot found that when an AF_PACKET socket is using PACKET_COPY_THRESH and mmap operations, tpacket_rcv() is queueing skbs with garbage in skb->cb[], triggering a too big copy [1] Presumably, users of af_packet using mmap() already gets correct metadata from the mapped buffer, we can simply make sure to clear 12 bytes that might be copied to user space later. 
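For illustration, the tpacket_rcv() hunk below does exactly this: the area of the copied skb's control buffer that may later be returned as the link-layer address is zeroed before the clone is queued.

        memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
               sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
        skb_set_owner_r(copy_skb, sk);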
BUG: KASAN: stack-out-of-bounds in memcpy include/linux/fortify-string.h:225 [inline] BUG: KASAN: stack-out-of-bounds in packet_recvmsg+0x56c/0x1150 net/packet/af_packet.c:3489 Write of size 165 at addr ffffc9000385fb78 by task syz-executor233/3631 CPU: 0 PID: 3631 Comm: syz-executor233 Not tainted 5.17.0-rc7-syzkaller-02396-g0b3660695e80 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0xf/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189 memcpy+0x39/0x60 mm/kasan/shadow.c:66 memcpy include/linux/fortify-string.h:225 [inline] packet_recvmsg+0x56c/0x1150 net/packet/af_packet.c:3489 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_recvmsg net/socket.c:962 [inline] ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632 ___sys_recvmsg+0x127/0x200 net/socket.c:2674 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fdfd5954c29 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 41 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffcf8e71e48 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fdfd5954c29 RDX: 0000000000000000 RSI: 0000000020000500 RDI: 0000000000000005 RBP: 0000000000000000 R08: 000000000000000d R09: 000000000000000d R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffcf8e71e60 R13: 00000000000f4240 R14: 000000000000c1ff R15: 00007ffcf8e71e54 addr ffffc9000385fb78 is located in stack of task syz-executor233/3631 at offset 32 in frame: ____sys_recvmsg+0x0/0x600 include/linux/uio.h:246 this frame has 1 object: [32, 160) 'addr' Memory state around the buggy address: ffffc9000385fa80: 00 04 f3 f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 ffffc9000385fb00: 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 >ffffc9000385fb80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f3 ^ ffffc9000385fc00: f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 f1 ffffc9000385fc80: f1 f1 f1 00 f2 f2 f2 00 f2 f2 f2 00 00 00 00 00 ================================================================== Fixes: 0fb375fb9b93 ("[AF_PACKET]: Allow for > 8 byte hardware addresses.") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220312232958.3535620-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/packet/af_packet.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index bacb1bbe53e2..ddf0d081b5bc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2241,8 +2241,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, copy_skb = skb_get(skb); skb_head = skb->data; } - if (copy_skb) + if (copy_skb) { + memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, + sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); + } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { @@ -3400,6 +3403,8 @@ static int 
packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + const size_t max_len = min(sizeof(skb->cb), + sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled @@ -3422,6 +3427,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(struct sockaddr_ll); } } + if (WARN_ON_ONCE(copy_len > max_len)) { + copy_len = max_len; + msg->msg_namelen = copy_len; + } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } -- Gitee From a54e6bee81f61e53a3e957d6d8e6d98fdfcecc77 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Sat, 7 May 2022 08:24:38 +0000 Subject: [PATCH 309/340] net: handle ARPHRD_PIMREG in dev_is_mac_header_xmit() stable inclusion from linux-4.19.236 commit 274076251d72ab5b2de99e3a3f0b577cd251fcc2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- [ Upstream commit 4ee06de7729d795773145692e246a06448b1eb7a ] This kind of interface doesn't have a mac header. This patch fixes bpf_redirect() to a PIM interface. Fixes: 27b29f63058d ("bpf: add bpf_redirect() helper") Signed-off-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20220315092008.31423-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- include/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index e44746de95cd..c697a0524273 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -55,6 +55,7 @@ static inline bool dev_is_mac_header_xmit(const struct net_device *dev) case ARPHRD_VOID: case ARPHRD_NONE: case ARPHRD_RAWIP: + case ARPHRD_PIMREG: return false; default: return true; -- Gitee From 9617c028309bc741f7ed98a6ef2408ee8093082a Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Sat, 7 May 2022 08:24:39 +0000 Subject: [PATCH 310/340] net: ipv6: fix skb_over_panic in __ip6_append_data stable inclusion from linux-4.19.237 commit 616e2dffaba372cf20b01172a73013ed28cbcc84 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5646A CVE: NA -------------------------------- commit 5e34af4142ffe68f01c8a9acae83300f8911e20c upstream. Syzbot found a kernel bug in the ipv6 stack: LINK: https://syzkaller.appspot.com/bug?id=205d6f11d72329ab8d62a610c44c5e7e25415580 The reproducer triggers it by sending a crafted message via a sendmmsg() call, which triggers skb_over_panic, and crashes the kernel: skbuff: skb_over_panic: text:ffffffff84647fb4 len:65575 put:65575 head:ffff888109ff0000 data:ffff888109ff0088 tail:0x100af end:0xfec0 dev: Update the check that prevents an invalid packet with MTU equal to the fragment header size to eat up all the space for payload.
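A worked view of the boundary case, for illustration (variable names as in the hunk below): when mtu == fragheaderlen, the old 'mtu < fragheaderlen' test does not fire even though the headers already consume the whole MTU and no payload space is left; making both comparisons inclusive sends this case down the emsgsize path instead.

        /* mtu == fragheaderlen:
         *   old: mtu <  fragheaderlen  -> false, bogus length accepted
         *   new: mtu <= fragheaderlen  -> true,  goto emsgsize
         */
        if (mtu <= fragheaderlen ||
            ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
                goto emsgsize;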
The reproducer can be found here: LINK: https://syzkaller.appspot.com/text?tag=ReproC&x=1648c83fb00000 Reported-by: syzbot+e223cf47ec8ae183f2a0@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20220310232538.1044947-1-tadeusz.struk@linaro.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/ipv6/ip6_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fd496bda7ec7..373df391dc93 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1325,8 +1325,8 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; - if (mtu < fragheaderlen || - ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr)) + if (mtu <= fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) goto emsgsize; maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - -- Gitee From 9f38975317cf08b00f4fffabc07de87f3ef7b8e1 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Sat, 7 May 2022 08:24:40 +0000 Subject: [PATCH 311/340] share_pool: Fix ABBA deadlock hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I54I7W CVE: NA There is a ABBA deadlock in the process of sp_group_add_task and proc_stat_show(). PROCESS A: sp_group_add_task() acquire sp_group_sem write lock ->sp_init_proc_stat() acquire sp_spg_stat_sem write lock PROCESS B: proc_stat_show() acquire sp_spg_stat_sem read lock ->idr_proc_stat_cb() acquire sp_group_sem read lock Here we choose the simplest way that acquires sp_group_sem and sp_stat_sem read lock subsequently in proc_stat_show(), since it just has effect on the process of debug feature. Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- mm/share_pool.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 40147ef85ba5..f33b4b308dcf 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -4118,7 +4118,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) long sp_res, sp_res_nsize, non_sp_res, non_sp_shm; /* to prevent ABBA deadlock, first hold sp_group_sem */ - down_read(&sp_group_sem); mutex_lock(&spg_stat->lock); hash_for_each(spg_stat->hash, i, spg_proc_stat, gnode) { proc_stat = spg_proc_stat->proc_stat; @@ -4147,7 +4146,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) seq_putc(seq, '\n'); } mutex_unlock(&spg_stat->lock); - up_read(&sp_group_sem); return 0; } @@ -4165,10 +4163,16 @@ static int proc_stat_show(struct seq_file *seq, void *offset) byte2kb(atomic64_read(&kthread_stat.alloc_size)), byte2kb(atomic64_read(&kthread_stat.k2u_size))); - /* pay attention to potential ABBA deadlock */ + /* + * This ugly code is just for fixing the ABBA deadlock against + * sp_group_add_task. 
+ */ + down_read(&sp_group_sem); down_read(&sp_spg_stat_sem); idr_for_each(&sp_spg_stat_idr, idr_proc_stat_cb, seq); up_read(&sp_spg_stat_sem); + up_read(&sp_group_sem); + return 0; } -- Gitee From 1177aad150e5bb5458f80bd92e4be42eb2142fef Mon Sep 17 00:00:00 2001 From: Guo Mengqi Date: Sat, 7 May 2022 08:24:41 +0000 Subject: [PATCH 312/340] sharepool: fix hisi oom deadlock hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I54I59 CVE: NA -------------------------------------------------- When hisi_oom_notifier_call calls spg_overview_show, it requires the global rwsem sp_group_sem, which had been held by another process when oomed. This leads to kernel hungtask. At another position the unecessary sp_group_sem causes an AA deadlock. [ 1934.549016] INFO: task klogd:2757 blocked for more than 120 seconds. [ 1934.562408] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1934.570231] klogd D 0 2757 2746 0x00000000 [ 1934.575707] Call trace: [ 1934.578162] __switch_to+0xe8/0x150 [ 1934.581648] __schedule+0x250/0x558 [ 1934.585133] schedule+0x30/0xf0 [ 1934.588267] rwsem_down_read_failed+0x10c/0x188 [ 1934.592788] down_read+0x60/0x68 [ 1934.596015] spg_overview_show.part.31+0xc8/0xf8 [ 1934.600622] spg_overview_show+0x2c/0x38 [ 1934.604543] hisi_oom_notifier_call+0xe8/0x120 [ 1934.608975] out_of_memory+0x7c/0x570 [ 1934.612631] __alloc_pages_nodemask+0xcfc/0xd98 [ 1934.617158] alloc_pages_current+0x88/0xf0 [ 1934.621246] __page_cache_alloc+0x8c/0xd8 [ 1934.625247] page_cache_alloc_inode+0x48/0x58 [ 1934.629595] filemap_fault+0x360/0x8e0 [ 1934.633341] ext4_filemap_fault+0x38/0x128 [ 1934.637431] __do_fault+0x50/0x218 [ 1934.640822] __handle_mm_fault+0x69c/0x9c8 [ 1934.644909] handle_mm_fault+0xf8/0x200 [ 1934.648740] do_page_fault+0x220/0x508 [ 1934.652477] do_translation_fault+0xa8/0xbc [ 1934.656652] do_mem_abort+0x68/0x118 [ 1934.660216] do_el0_ia_bp_hardening+0x6c/0xd8 [ 1934.664565] el0_ia+0x20/0x24 Signed-off-by: Guo Mengqi Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- mm/share_pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index f33b4b308dcf..54d137eae3cf 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -4077,9 +4077,9 @@ void spg_overview_show(struct seq_file *seq) atomic_read(&sp_overall_stat.spa_total_num)); } - down_read(&sp_group_sem); + down_read(&sp_spg_stat_sem); idr_for_each(&sp_spg_stat_idr, idr_spg_stat_cb, seq); - up_read(&sp_group_sem); + up_read(&sp_spg_stat_sem); if (seq != NULL) seq_puts(seq, "\n"); -- Gitee From 1811840c2cdebd0820818392a8217ffbd1be5c67 Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Sat, 7 May 2022 08:24:42 +0000 Subject: [PATCH 313/340] mm/sharepool: Fix sharepool node id invalid when using sp_alloc hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I54IL8 CVE: NA ----------------------------- When passing numa id to sp_alloc, sometimes numa id does not work. This is because memory policy will change numa id to a preferred one if memory policy is set. Fix the error by mbind virtual address to desired numa id. 
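A minimal sketch of the binding step this patch adds (see sp_mbind() in the diff below): build a nodemask holding only the requested node and bind the mapped range to it before the range is populated.

        nodemask_t nmask;

        nodes_clear(nmask);
        node_set(node, nmask);
        ret = __do_mbind(start, len, MPOL_BIND, MPOL_F_STATIC_NODES,
                         &nmask, MPOL_MF_STRICT, mm);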
Signed-off-by: Zhang Jian Signed-off-by: Zhou Guanghui Signed-off-by: Wang Wensheng Reviewed-by: Weilong Chen Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- include/linux/mempolicy.h | 10 ++++++ mm/mempolicy.c | 14 ++++++--- mm/share_pool.c | 65 ++++++++++++++++++++++++--------------- 3 files changed, 61 insertions(+), 28 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 00437ae80b6f..17ec6bb919d7 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -219,6 +219,9 @@ static inline bool vma_migratable(struct vm_area_struct *vma) extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); +extern long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm); #else struct mempolicy {}; @@ -322,5 +325,12 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, static inline void mpol_put_task_policy(struct task_struct *task) { } + +static long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm) +{ + return 0; +} #endif /* CONFIG_NUMA */ #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 76a577cfc277..36961d6d364e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1259,11 +1259,10 @@ static struct page *new_page(struct page *page, unsigned long start) } #endif -static long do_mbind(unsigned long start, unsigned long len, - unsigned short mode, unsigned short mode_flags, - nodemask_t *nmask, unsigned long flags) +long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm) { - struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; int err; @@ -1364,6 +1363,13 @@ static long do_mbind(unsigned long start, unsigned long len, return err; } +static long do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags) +{ + return __do_mbind(start, len, mode, mode_flags, nmask, flags, current->mm); +} + /* * User space interface with variable sized bitmaps for nodelists. */ diff --git a/mm/share_pool.c b/mm/share_pool.c index 54d137eae3cf..833ecb7dd859 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -15,7 +15,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ - #define pr_fmt(fmt) "share pool: " fmt #include @@ -2174,6 +2173,7 @@ struct sp_alloc_context { bool need_fallocate; struct timespec64 start; struct timespec64 end; + bool have_mbind; }; static void trace_sp_alloc_begin(struct sp_alloc_context *ac) @@ -2316,6 +2316,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, ac->sp_flags = sp_flags; ac->state = ALLOC_NORMAL; ac->need_fallocate = false; + ac->have_mbind = false; return 0; } @@ -2409,7 +2410,7 @@ static void sp_alloc_fallback(struct sp_area *spa, struct sp_alloc_context *ac) } static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, - struct sp_group_node *spg_node, struct sp_alloc_context *ac) + struct sp_alloc_context *ac) { int ret = 0; unsigned long sp_addr = spa->va_start; @@ -2441,25 +2442,19 @@ static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, if (ret) sp_add_work_compact(); } - if (ret) { - if (spa->spg != spg_none) - sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); - else - sp_munmap(mm, spa->va_start, spa->real_size); - - if (unlikely(fatal_signal_pending(current))) - pr_warn_ratelimited("allocation failed, current thread is killed\n"); - else - pr_warn_ratelimited("allocation failed due to mm populate failed" - "(potential no enough memory when -12): %d\n", ret); - sp_fallocate(spa); /* need this, otherwise memleak */ - sp_alloc_fallback(spa, ac); - } else { - ac->need_fallocate = true; - } return ret; } +static long sp_mbind(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned long node) +{ + nodemask_t nmask; + + nodes_clear(nmask); + node_set(node, nmask); + return __do_mbind(start, len, MPOL_BIND, MPOL_F_STATIC_NODES, + &nmask, MPOL_MF_STRICT, mm); +} + static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, struct sp_group_node *spg_node, struct sp_alloc_context *ac) { @@ -2475,7 +2470,34 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, return ret; } - ret = sp_alloc_populate(mm, spa, spg_node, ac); + if (!ac->have_mbind) { + ret = sp_mbind(mm, spa->va_start, spa->real_size, spa->node_id); + if (ret < 0) { + pr_err("cannot bind the memory range to specified node:%d, err:%d\n", + spa->node_id, ret); + goto err; + } + ac->have_mbind = true; + } + + ret = sp_alloc_populate(mm, spa, ac); + if (ret) { +err: + if (spa->spg != spg_none) + sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); + else + sp_munmap(mm, spa->va_start, spa->real_size); + + if (unlikely(fatal_signal_pending(current))) + pr_warn_ratelimited("allocation failed, current thread is killed\n"); + else + pr_warn_ratelimited("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n", + ret); + sp_fallocate(spa); /* need this, otherwise memleak */ + sp_alloc_fallback(spa, ac); + } else + ac->need_fallocate = true; + return ret; } @@ -2497,11 +2519,6 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, if (mmap_ret) { if (ac->state != ALLOC_COREDUMP) return mmap_ret; - if (ac->spg == spg_none) { - sp_alloc_unmap(mm, spa, spg_node); - pr_err("dvpp allocation failed due to coredump"); - return mmap_ret; - } ac->state = ALLOC_NORMAL; continue; } -- Gitee From 42dc8522429a6a8ab62d625b13335da273331719 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 7 May 2022 09:12:20 +0000 Subject: [PATCH 314/340] io_uring: fix false WARN_ONCE mainline inclusion from mainline-v5.13-rc6 commit e6ab8991c5d0b0deae0961dc22c0edd1dee328f5 
category: bugfix bugzilla: 186576, https://gitee.com/openeuler/kernel/issues/I554BJ CVE: NA -------------------------------- io_uring: fix false WARN_ONCE WARNING: CPU: 1 PID: 11749 at fs/io-wq.c:244 io_wqe_wake_worker fs/io-wq.c:244 [inline] WARNING: CPU: 1 PID: 11749 at fs/io-wq.c:244 io_wqe_enqueue+0x7f6/0x910 fs/io-wq.c:751 A WARN_ON_ONCE() in io_wqe_wake_worker() can be triggered by a valid userspace setup. Replace it with pr_warn. Reported-by: syzbot+ea2f1484cffe5109dc10@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f7ede342c3342c4c26668f5168e2993e38bbd99c.1623949695.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Guo Xuenan Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/io-wq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 9c223dfb4b3f..b6e311e57b47 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -282,7 +282,8 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) * Most likely an attempt to queue unbounded work on an io_wq that * wasn't setup with any unbounded workers. */ - WARN_ON_ONCE(!acct->max_workers); + if (unlikely(!acct->max_workers)) + pr_warn_once("io-wq is not configured for unbound workers"); rcu_read_lock(); ret = io_wqe_activate_free_worker(wqe); @@ -1050,6 +1051,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(!bounded)) + return ERR_PTR(-EINVAL); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) -- Gitee From f25fd7563d1dee376593b19cca7c3ff789ddd998 Mon Sep 17 00:00:00 2001 From: Slawomir Mrozowicz Date: Mon, 9 May 2022 03:38:43 +0000 Subject: [PATCH 315/340] ixgbe: add the ability for the PF to disable VF link state mainline inclusion from mainline-v5.18-rc1 commit 366fd1000995d4cf64e1a61a0d78a051550b9841 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I532H9 CVE: CVE-2021-33061 -------------------------------- Add support for ndo_set_vf_link_state the Network Device Option that allows the PF driver to control the virtual link state of the VF devices. Without this change a VF cannot be disabled/enabled by the administrator. In the implementation the auto state takes over PF link state to VF link setting, the enable state is not supported, the disable state shut off the VF link regardless of the PF setting. Signed-off-by: Slawomir Mrozowicz Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Conflicts: drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c Signed-off-by: Ziyang Xuan Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 11 +- drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h | 2 + .../net/ethernet/intel/ixgbe/ixgbe_sriov.c | 206 ++++++++++++++---- .../net/ethernet/intel/ixgbe/ixgbe_sriov.h | 4 +- 5 files changed, 181 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 4fc906c6166b..9e7b7c224a31 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -179,6 +179,8 @@ struct vf_data_storage { u16 pf_vlan; /* When set, guest VLAN config not allowed. 
*/ u16 pf_qos; u16 tx_rate; + int link_enable; + int link_state; u8 spoofchk_enabled; bool rss_query_enabled; u8 trusted; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index b52585fbb295..d8f5df230a7d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5622,6 +5622,9 @@ static void ixgbe_up_complete(struct ixgbe_adapter *adapter) ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT); ctrl_ext |= IXGBE_CTRL_EXT_PFRSTD; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl_ext); + + /* update setting rx tx for all active vfs */ + ixgbe_set_all_vfs(adapter); } void ixgbe_reinit_locked(struct ixgbe_adapter *adapter) @@ -6074,11 +6077,8 @@ void ixgbe_down(struct ixgbe_adapter *adapter) for (i = 0 ; i < adapter->num_vfs; i++) adapter->vfinfo[i].clear_to_send = false; - /* ping all the active vfs to let them know we are going down */ - ixgbe_ping_all_vfs(adapter); - - /* Disable all VFTE/VFRE TX/RX */ - ixgbe_disable_tx_rx(adapter); + /* update setting rx tx for all active vfs */ + ixgbe_set_all_vfs(adapter); } /* disable transmits in the hardware now that interrupts are off */ @@ -10288,6 +10288,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_set_vf_vlan = ixgbe_ndo_set_vf_vlan, .ndo_set_vf_rate = ixgbe_ndo_set_vf_bw, .ndo_set_vf_spoofchk = ixgbe_ndo_set_vf_spoofchk, + .ndo_set_vf_link_state = ixgbe_ndo_set_vf_link_state, .ndo_set_vf_rss_query_en = ixgbe_ndo_set_vf_rss_query_en, .ndo_set_vf_trust = ixgbe_ndo_set_vf_trust, .ndo_get_vf_config = ixgbe_ndo_get_vf_config, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h index e085b6520dac..2246b07beae4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h @@ -80,6 +80,8 @@ enum ixgbe_pfvf_api_rev { #define IXGBE_VF_UPDATE_XCAST_MODE 0x0c +#define IXGBE_VF_GET_LINK_STATE 0x10 /* get vf link state */ + /* length of permanent address message returned from PF */ #define IXGBE_VF_PERMADDR_MSG_LEN 4 /* word in permanent address message with the current multicast type */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 8aaf856771d7..1f0e84120b32 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -96,6 +96,7 @@ static int __ixgbe_enable_sriov(struct ixgbe_adapter *adapter, for (i = 0; i < num_vfs; i++) { /* enable spoof checking for all VFs */ adapter->vfinfo[i].spoofchk_enabled = true; + adapter->vfinfo[i].link_enable = true; /* We support VF RSS querying only for 82599 and x540 * devices at the moment. 
These devices share RSS @@ -816,6 +817,57 @@ static inline void ixgbe_write_qde(struct ixgbe_adapter *adapter, u32 vf, } } +/** + * ixgbe_set_vf_rx_tx - Set VF rx tx + * @adapter: Pointer to adapter struct + * @vf: VF identifier + * + * Set or reset correct transmit and receive for vf + **/ +static void ixgbe_set_vf_rx_tx(struct ixgbe_adapter *adapter, int vf) +{ + u32 reg_cur_tx, reg_cur_rx, reg_req_tx, reg_req_rx; + struct ixgbe_hw *hw = &adapter->hw; + u32 reg_offset, vf_shift; + + vf_shift = vf % 32; + reg_offset = vf / 32; + + reg_cur_tx = IXGBE_READ_REG(hw, IXGBE_VFTE(reg_offset)); + reg_cur_rx = IXGBE_READ_REG(hw, IXGBE_VFRE(reg_offset)); + + if (adapter->vfinfo[vf].link_enable) { + reg_req_tx = reg_cur_tx | 1 << vf_shift; + reg_req_rx = reg_cur_rx | 1 << vf_shift; + } else { + reg_req_tx = reg_cur_tx & ~(1 << vf_shift); + reg_req_rx = reg_cur_rx & ~(1 << vf_shift); + } + + /* The 82599 cannot support a mix of jumbo and non-jumbo PF/VFs. + * For more info take a look at ixgbe_set_vf_lpe + */ + if (adapter->hw.mac.type == ixgbe_mac_82599EB) { + struct net_device *dev = adapter->netdev; + int pf_max_frame = dev->mtu + ETH_HLEN; + +#if IS_ENABLED(CONFIG_FCOE) + if (dev->features & NETIF_F_FCOE_MTU) + pf_max_frame = max_t(int, pf_max_frame, + IXGBE_FCOE_JUMBO_FRAME_SIZE); +#endif /* CONFIG_FCOE */ + + if (pf_max_frame > ETH_FRAME_LEN) + reg_req_rx = reg_cur_rx & ~(1 << vf_shift); + } + + /* Enable/Disable particular VF */ + if (reg_cur_tx != reg_req_tx) + IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), reg_req_tx); + if (reg_cur_rx != reg_req_rx) + IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), reg_req_rx); +} + static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf) { struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ]; @@ -841,11 +893,6 @@ static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf) vf_shift = vf % 32; reg_offset = vf / 32; - /* enable transmit for vf */ - reg = IXGBE_READ_REG(hw, IXGBE_VFTE(reg_offset)); - reg |= BIT(vf_shift); - IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), reg); - /* force drop enable for all VF Rx queues */ reg = IXGBE_QDE_ENABLE; if (adapter->vfinfo[vf].pf_vlan) @@ -853,27 +900,7 @@ static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf) ixgbe_write_qde(adapter, vf, reg); - /* enable receive for vf */ - reg = IXGBE_READ_REG(hw, IXGBE_VFRE(reg_offset)); - reg |= BIT(vf_shift); - /* - * The 82599 cannot support a mix of jumbo and non-jumbo PF/VFs. 
- * For more info take a look at ixgbe_set_vf_lpe - */ - if (adapter->hw.mac.type == ixgbe_mac_82599EB) { - struct net_device *dev = adapter->netdev; - int pf_max_frame = dev->mtu + ETH_HLEN; - -#ifdef CONFIG_FCOE - if (dev->features & NETIF_F_FCOE_MTU) - pf_max_frame = max_t(int, pf_max_frame, - IXGBE_FCOE_JUMBO_FRAME_SIZE); - -#endif /* CONFIG_FCOE */ - if (pf_max_frame > ETH_FRAME_LEN) - reg &= ~BIT(vf_shift); - } - IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), reg); + ixgbe_set_vf_rx_tx(adapter, vf); /* enable VF mailbox for further messages */ adapter->vfinfo[vf].clear_to_send = true; @@ -1193,6 +1220,25 @@ static int ixgbe_update_vf_xcast_mode(struct ixgbe_adapter *adapter, return 0; } +static int ixgbe_get_vf_link_state(struct ixgbe_adapter *adapter, + u32 *msgbuf, u32 vf) +{ + u32 *link_state = &msgbuf[1]; + + /* verify the PF is supporting the correct API */ + switch (adapter->vfinfo[vf].vf_api) { + case ixgbe_mbox_api_12: + case ixgbe_mbox_api_13: + break; + default: + return -EOPNOTSUPP; + } + + *link_state = adapter->vfinfo[vf].link_enable; + + return 0; +} + static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf) { u32 mbx_size = IXGBE_VFMAILBOX_SIZE; @@ -1258,6 +1304,9 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf) case IXGBE_VF_UPDATE_XCAST_MODE: retval = ixgbe_update_vf_xcast_mode(adapter, msgbuf, vf); break; + case IXGBE_VF_GET_LINK_STATE: + retval = ixgbe_get_vf_link_state(adapter, msgbuf, vf); + break; default: e_err(drv, "Unhandled Msg %8.8x\n", msgbuf[0]); retval = IXGBE_ERR_MBX; @@ -1307,18 +1356,6 @@ void ixgbe_msg_task(struct ixgbe_adapter *adapter) } } -void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter) -{ - struct ixgbe_hw *hw = &adapter->hw; - - /* disable transmit and receive for all vfs */ - IXGBE_WRITE_REG(hw, IXGBE_VFTE(0), 0); - IXGBE_WRITE_REG(hw, IXGBE_VFTE(1), 0); - - IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), 0); - IXGBE_WRITE_REG(hw, IXGBE_VFRE(1), 0); -} - static inline void ixgbe_ping_vf(struct ixgbe_adapter *adapter, int vf) { struct ixgbe_hw *hw = &adapter->hw; @@ -1344,6 +1381,21 @@ void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter) } } +/** + * ixgbe_set_all_vfs - update vfs queues + * @adapter: Pointer to adapter struct + * + * Update setting transmit and receive queues for all vfs + **/ +void ixgbe_set_all_vfs(struct ixgbe_adapter *adapter) +{ + int i; + + for (i = 0 ; i < adapter->num_vfs; i++) + ixgbe_set_vf_link_state(adapter, i, + adapter->vfinfo[i].link_state); +} + int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) { struct ixgbe_adapter *adapter = netdev_priv(netdev); @@ -1641,6 +1693,84 @@ int ixgbe_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting) return 0; } +/** + * ixgbe_set_vf_link_state - Set link state + * @adapter: Pointer to adapter struct + * @vf: VF identifier + * @state: required link state + * + * Set a link force state on/off a single vf + **/ +void ixgbe_set_vf_link_state(struct ixgbe_adapter *adapter, int vf, int state) +{ + adapter->vfinfo[vf].link_state = state; + + switch (state) { + case IFLA_VF_LINK_STATE_AUTO: + if (test_bit(__IXGBE_DOWN, &adapter->state)) + adapter->vfinfo[vf].link_enable = false; + else + adapter->vfinfo[vf].link_enable = true; + break; + case IFLA_VF_LINK_STATE_ENABLE: + adapter->vfinfo[vf].link_enable = true; + break; + case IFLA_VF_LINK_STATE_DISABLE: + adapter->vfinfo[vf].link_enable = false; + break; + } + + ixgbe_set_vf_rx_tx(adapter, vf); + + /* restart the VF */ + adapter->vfinfo[vf].clear_to_send = false; + 
ixgbe_ping_vf(adapter, vf); +} + +/** + * ixgbe_ndo_set_vf_link_state - Set link state + * @netdev: network interface device structure + * @vf: VF identifier + * @state: required link state + * + * Set the link state of a specified VF, regardless of physical link state + **/ +int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state) +{ + struct ixgbe_adapter *adapter = netdev_priv(netdev); + int ret = 0; + + if (vf < 0 || vf >= adapter->num_vfs) { + dev_err(&adapter->pdev->dev, + "NDO set VF link - invalid VF identifier %d\n", vf); + return -EINVAL; + } + + switch (state) { + case IFLA_VF_LINK_STATE_ENABLE: + dev_info(&adapter->pdev->dev, + "NDO set VF %d link state %d - not supported\n", + vf, state); + break; + case IFLA_VF_LINK_STATE_DISABLE: + dev_info(&adapter->pdev->dev, + "NDO set VF %d link state disable\n", vf); + ixgbe_set_vf_link_state(adapter, vf, state); + break; + case IFLA_VF_LINK_STATE_AUTO: + dev_info(&adapter->pdev->dev, + "NDO set VF %d link state auto\n", vf); + ixgbe_set_vf_link_state(adapter, vf, state); + break; + default: + dev_err(&adapter->pdev->dev, + "NDO set VF %d - invalid link state %d\n", vf, state); + ret = -EINVAL; + } + + return ret; +} + int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf, bool setting) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h index 3ec21923c89c..0690ecb8dfa3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h @@ -17,8 +17,8 @@ void ixgbe_restore_vf_multicasts(struct ixgbe_adapter *adapter); #endif void ixgbe_msg_task(struct ixgbe_adapter *adapter); int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask); -void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter); void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter); +void ixgbe_set_all_vfs(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac); int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan, u8 qos, __be16 vlan_proto); @@ -31,7 +31,9 @@ int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf, int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting); int ixgbe_ndo_get_vf_config(struct net_device *netdev, int vf, struct ifla_vf_info *ivi); +int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state); void ixgbe_check_vf_rate_limit(struct ixgbe_adapter *adapter); +void ixgbe_set_vf_link_state(struct ixgbe_adapter *adapter, int vf, int state); int ixgbe_disable_sriov(struct ixgbe_adapter *adapter); #ifdef CONFIG_PCI_IOV void ixgbe_enable_sriov(struct ixgbe_adapter *adapter, unsigned int max_vfs); -- Gitee From 2ece2378f8da40d893135e4e3a98daa2b6dc3d58 Mon Sep 17 00:00:00 2001 From: Slawomir Mrozowicz Date: Mon, 9 May 2022 03:38:44 +0000 Subject: [PATCH 316/340] ixgbe: add improvement for MDD response functionality mainline inclusion from mainline-v5.18-rc1 commit 008ca35f6e87be1d60b6af3d1ae247c6d5c2531d category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I532H9 CVE: CVE-2021-33061 -------------------------------- The 82599 PF driver disable VF driver after a special MDD event occurs. Adds the option for administrators to control whether VFs are automatically disabled after several MDD events. The automatically disabling is now the default mode for 82599 PF driver, as it is more reliable. This addresses CVE-2021-33061. 
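At its core the new behaviour is a per-VF abort counter checked against a fixed threshold. The sketch below condenses the ixgbe_bad_vf_abort() handler added in the diff that follows; the real code additionally requires 82599 hardware and the new mdd-disable-vf private flag to be set, and logs the offending VF's MAC address (the sketch's function name is illustrative, not the driver's):

static void bad_vf_abort_sketch(struct ixgbe_adapter *adapter, u32 vf)
{
        /* one more master abort reported in this VF's PCI status register */
        if (++adapter->vfinfo[vf].primary_abort_count <
            IXGBE_PRIMARY_ABORT_LIMIT)
                return;

        /* threshold (5 events) reached: force the VF link down and reset
         * the counter so a re-enabled VF starts counting from zero again */
        ixgbe_set_vf_link_state(adapter, vf, IFLA_VF_LINK_STATE_DISABLE);
        adapter->vfinfo[vf].primary_abort_count = 0;
}

Administrators control the behaviour through the mdd-disable-vf ethtool private flag; re-enabling the flag also clears the per-VF counters, as the ethtool hunk below shows, and the flag defaults to on for 82599, set at probe time.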
Signed-off-by: Slawomir Mrozowicz Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Conflicts: drivers/net/ethernet/intel/ixgbe/ixgbe.h drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c drivers/net/ethernet/intel/ixgbe/ixgbe_main.c Signed-off-by: Ziyang Xuan Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 4 +++ .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 21 ++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 28 ++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 9e7b7c224a31..074e23b4534b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -186,6 +186,7 @@ struct vf_data_storage { u8 trusted; int xcast_mode; unsigned int vf_api; + u8 primary_abort_count; }; enum ixgbevf_xcast_modes { @@ -548,6 +549,8 @@ struct ixgbe_mac_addr { #define IXGBE_TRY_LINK_TIMEOUT (4 * HZ) #define IXGBE_SFP_POLL_JIFFIES (2 * HZ) /* SFP poll every 2 seconds */ +#define IXGBE_PRIMARY_ABORT_LIMIT 5 + /* board specific private data structure */ struct ixgbe_adapter { unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; @@ -607,6 +610,7 @@ struct ixgbe_adapter { #define IXGBE_FLAG2_EEE_ENABLED BIT(15) #define IXGBE_FLAG2_RX_LEGACY BIT(16) #define IXGBE_FLAG2_IPSEC_ENABLED BIT(17) +#define IXGBE_FLAG2_AUTO_DISABLE_VF BIT(19) /* Tx fast path data */ int num_tx_queues; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 8829bd95d0d3..73e769212c65 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -136,6 +136,8 @@ static const char ixgbe_gstrings_test[][ETH_GSTRING_LEN] = { static const char ixgbe_priv_flags_strings[][ETH_GSTRING_LEN] = { #define IXGBE_PRIV_FLAGS_LEGACY_RX BIT(0) "legacy-rx", +#define IXGBE_PRIV_FLAGS_AUTO_DISABLE_VF BIT(2) + "mdd-disable-vf", }; #define IXGBE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(ixgbe_priv_flags_strings) @@ -3410,6 +3412,9 @@ static u32 ixgbe_get_priv_flags(struct net_device *netdev) if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY) priv_flags |= IXGBE_PRIV_FLAGS_LEGACY_RX; + if (adapter->flags2 & IXGBE_FLAG2_AUTO_DISABLE_VF) + priv_flags |= IXGBE_PRIV_FLAGS_AUTO_DISABLE_VF; + return priv_flags; } @@ -3417,11 +3422,27 @@ static int ixgbe_set_priv_flags(struct net_device *netdev, u32 priv_flags) { struct ixgbe_adapter *adapter = netdev_priv(netdev); unsigned int flags2 = adapter->flags2; + unsigned int i; flags2 &= ~IXGBE_FLAG2_RX_LEGACY; if (priv_flags & IXGBE_PRIV_FLAGS_LEGACY_RX) flags2 |= IXGBE_FLAG2_RX_LEGACY; + flags2 &= ~IXGBE_FLAG2_AUTO_DISABLE_VF; + if (priv_flags & IXGBE_PRIV_FLAGS_AUTO_DISABLE_VF) { + if (adapter->hw.mac.type == ixgbe_mac_82599EB) { + /* Reset primary abort counter */ + for (i = 0; i < adapter->num_vfs; i++) + adapter->vfinfo[i].primary_abort_count = 0; + + flags2 |= IXGBE_FLAG2_AUTO_DISABLE_VF; + } else { + e_info(probe, + "Cannot set private flags: Operation not supported\n"); + return -EOPNOTSUPP; + } + } + if (flags2 != adapter->flags2) { adapter->flags2 = flags2; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d8f5df230a7d..9455c6191694 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7565,6 +7565,27 @@ static void 
ixgbe_watchdog_flush_tx(struct ixgbe_adapter *adapter) } #ifdef CONFIG_PCI_IOV +static void ixgbe_bad_vf_abort(struct ixgbe_adapter *adapter, u32 vf) +{ + struct ixgbe_hw *hw = &adapter->hw; + + if (adapter->hw.mac.type == ixgbe_mac_82599EB && + adapter->flags2 & IXGBE_FLAG2_AUTO_DISABLE_VF) { + adapter->vfinfo[vf].primary_abort_count++; + if (adapter->vfinfo[vf].primary_abort_count == + IXGBE_PRIMARY_ABORT_LIMIT) { + ixgbe_set_vf_link_state(adapter, vf, + IFLA_VF_LINK_STATE_DISABLE); + adapter->vfinfo[vf].primary_abort_count = 0; + + e_info(drv, + "Malicious Driver Detection event detected on PF %d VF %d MAC: %pM mdd-disable-vf=on", + hw->bus.func, vf, + adapter->vfinfo[vf].vf_mac_addresses); + } + } +} + static void ixgbe_check_for_bad_vf(struct ixgbe_adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; @@ -7596,8 +7617,10 @@ static void ixgbe_check_for_bad_vf(struct ixgbe_adapter *adapter) continue; pci_read_config_word(vfdev, PCI_STATUS, &status_reg); if (status_reg != IXGBE_FAILED_READ_CFG_WORD && - status_reg & PCI_STATUS_REC_MASTER_ABORT) + status_reg & PCI_STATUS_REC_MASTER_ABORT) { + ixgbe_bad_vf_abort(adapter, vf); pcie_flr(vfdev); + } } } @@ -10602,6 +10625,9 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; + if (adapter->hw.mac.type == ixgbe_mac_82599EB) + adapter->flags2 |= IXGBE_FLAG2_AUTO_DISABLE_VF; + /* Make sure the SWFW semaphore is in a valid state */ if (hw->mac.ops.init_swfw_sync) hw->mac.ops.init_swfw_sync(hw); -- Gitee From 15f6c3120c2989460409c4967dd8d776af1896e1 Mon Sep 17 00:00:00 2001 From: Slawomir Mrozowicz Date: Mon, 9 May 2022 03:38:45 +0000 Subject: [PATCH 317/340] ixgbevf: add disable link state mainline inclusion from mainline-v5.18-rc1 commit 443ebdd68b443ea0798c883e8aabf10d75268e92 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I532H9 CVE: CVE-2021-33061 -------------------------------- Add possibility to disable link state if it is administratively disabled in PF. It is part of the general functionality that allows the PF driver to control the state of the virtual link VF devices. 
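Under the hood this is a two-word mailbox exchange. The sketch below condenses the VF-side query that the vf.c hunk further down adds as ixgbevf_get_link_state_vf(); the real function keeps a separate ret_val variable but, exactly as here, maps both a write failure and a NACKed reply to IXGBE_ERR_MBX:

static s32 get_link_state_sketch(struct ixgbe_hw *hw, bool *link_state)
{
        u32 msgbuf[2];
        s32 err;

        msgbuf[0] = IXGBE_VF_GET_LINK_STATE;    /* opcode 0x10, added to mbx.h below */
        msgbuf[1] = 0;

        /* send the request and read the PF's reply back into msgbuf */
        err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2);
        if (err || (msgbuf[0] & IXGBE_VT_MSGTYPE_NACK))
                return IXGBE_ERR_MBX;           /* old PF API, or query refused */

        *link_state = msgbuf[1];        /* nonzero: link administratively enabled */
        return 0;
}

The result lands in adapter->link_state, and the watchdog only reports carrier when both the physical link and this administrative state are up, which is how a VF disabled from the PF side shows link down even while the port itself has link.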
Signed-off-by: Slawomir Mrozowicz Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Conflicts: drivers/net/ethernet/intel/ixgbevf/mbx.h drivers/net/ethernet/intel/ixgbevf/vf.c Signed-off-by: Ziyang Xuan Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 2 + .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 11 ++++- drivers/net/ethernet/intel/ixgbevf/mbx.h | 2 + drivers/net/ethernet/intel/ixgbevf/vf.c | 42 +++++++++++++++++++ drivers/net/ethernet/intel/ixgbevf/vf.h | 1 + 5 files changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 56a1031dcc07..469df8a242e8 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -383,6 +383,8 @@ struct ixgbevf_adapter { u32 *rss_key; u8 rss_indir_tbl[IXGBEVF_X550_VFRETA_SIZE]; u32 flags; + bool link_state; + #define IXGBEVF_FLAGS_LEGACY_RX BIT(1) }; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 25785fee5422..450c0a95d36d 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -2263,7 +2263,9 @@ static void ixgbevf_negotiate_api(struct ixgbevf_adapter *adapter) static void ixgbevf_up_complete(struct ixgbevf_adapter *adapter) { struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; struct ixgbe_hw *hw = &adapter->hw; + bool state; ixgbevf_configure_msix(adapter); @@ -2276,6 +2278,11 @@ static void ixgbevf_up_complete(struct ixgbevf_adapter *adapter) spin_unlock_bh(&adapter->mbx_lock); + state = adapter->link_state; + hw->mac.ops.get_link_state(hw, &adapter->link_state); + if (state && state != adapter->link_state) + dev_info(&pdev->dev, "VF is administratively disabled\n"); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DOWN, &adapter->state); ixgbevf_napi_enable_all(adapter); @@ -3045,6 +3052,8 @@ static int ixgbevf_sw_init(struct ixgbevf_adapter *adapter) adapter->tx_ring_count = IXGBEVF_DEFAULT_TXD; adapter->rx_ring_count = IXGBEVF_DEFAULT_RXD; + adapter->link_state = true; + set_bit(__IXGBEVF_DOWN, &adapter->state); return 0; @@ -3277,7 +3286,7 @@ static void ixgbevf_watchdog_subtask(struct ixgbevf_adapter *adapter) ixgbevf_watchdog_update_link(adapter); - if (adapter->link_up) + if (adapter->link_up && adapter->link_state) ixgbevf_watchdog_link_is_up(adapter); else ixgbevf_watchdog_link_is_down(adapter); diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.h b/drivers/net/ethernet/intel/ixgbevf/mbx.h index bfd9ae150808..464841e21315 100644 --- a/drivers/net/ethernet/intel/ixgbevf/mbx.h +++ b/drivers/net/ethernet/intel/ixgbevf/mbx.h @@ -92,6 +92,8 @@ enum ixgbe_pfvf_api_rev { #define IXGBE_VF_UPDATE_XCAST_MODE 0x0c +#define IXGBE_VF_GET_LINK_STATE 0x10 /* get vf link state */ + /* length of permanent address message returned from PF */ #define IXGBE_VF_PERMADDR_MSG_LEN 4 /* word in permanent address message with the current multicast type */ diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c index 7597d099c584..c96a4c3ca075 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.c +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c @@ -570,6 +570,46 @@ static s32 ixgbevf_hv_update_xcast_mode(struct ixgbe_hw *hw, int xcast_mode) return -EOPNOTSUPP; } +/** + * ixgbevf_get_link_state_vf - Get VF link state from PF + * @hw: 
pointer to the HW structure + * @link_state: link state storage + * + * Returns state of the operation error or success. + */ +static s32 ixgbevf_get_link_state_vf(struct ixgbe_hw *hw, bool *link_state) +{ + u32 msgbuf[2]; + s32 ret_val; + s32 err; + + msgbuf[0] = IXGBE_VF_GET_LINK_STATE; + msgbuf[1] = 0x0; + + err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); + + if (err || (msgbuf[0] & IXGBE_VT_MSGTYPE_NACK)) { + ret_val = IXGBE_ERR_MBX; + } else { + ret_val = 0; + *link_state = msgbuf[1]; + } + + return ret_val; +} + +/** + * ixgbevf_hv_get_link_state_vf - * Hyper-V variant - just a stub. + * @hw: unused + * @link_state: unused + * + * Hyper-V variant; there is no mailbox communication. + */ +static s32 ixgbevf_hv_get_link_state_vf(struct ixgbe_hw *hw, bool *link_state) +{ + return -EOPNOTSUPP; +} + /** * ixgbevf_set_vfta_vf - Set/Unset VLAN filter table address * @hw: pointer to the HW structure @@ -946,6 +986,7 @@ static const struct ixgbe_mac_operations ixgbevf_mac_ops = { .set_rar = ixgbevf_set_rar_vf, .update_mc_addr_list = ixgbevf_update_mc_addr_list_vf, .update_xcast_mode = ixgbevf_update_xcast_mode, + .get_link_state = ixgbevf_get_link_state_vf, .set_uc_addr = ixgbevf_set_uc_addr_vf, .set_vfta = ixgbevf_set_vfta_vf, .set_rlpml = ixgbevf_set_rlpml_vf, @@ -963,6 +1004,7 @@ static const struct ixgbe_mac_operations ixgbevf_hv_mac_ops = { .set_rar = ixgbevf_hv_set_rar_vf, .update_mc_addr_list = ixgbevf_hv_update_mc_addr_list_vf, .update_xcast_mode = ixgbevf_hv_update_xcast_mode, + .get_link_state = ixgbevf_hv_get_link_state_vf, .set_uc_addr = ixgbevf_hv_set_uc_addr_vf, .set_vfta = ixgbevf_hv_set_vfta_vf, .set_rlpml = ixgbevf_hv_set_rlpml_vf, diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h b/drivers/net/ethernet/intel/ixgbevf/vf.h index d1e9e306653b..45d9269218db 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.h +++ b/drivers/net/ethernet/intel/ixgbevf/vf.h @@ -42,6 +42,7 @@ struct ixgbe_mac_operations { s32 (*init_rx_addrs)(struct ixgbe_hw *); s32 (*update_mc_addr_list)(struct ixgbe_hw *, struct net_device *); s32 (*update_xcast_mode)(struct ixgbe_hw *, int); + s32 (*get_link_state)(struct ixgbe_hw *hw, bool *link_state); s32 (*enable_mc)(struct ixgbe_hw *); s32 (*disable_mc)(struct ixgbe_hw *); s32 (*clear_vfta)(struct ixgbe_hw *); -- Gitee From 857d0cd57976451a339959d6509529f11c1548c2 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Mon, 9 May 2022 11:30:08 +0000 Subject: [PATCH 318/340] mm: refactor the reclaim thread of page cache from per-cpu to per-node hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA ---------------------------------------------- In feature page cache limit, the reclaim work is per-cpu, it is unnecessary and will result some performance degradation in some case, such as storage (some cpus are devote for dead loop, no other works should be excuted on these cpus). In order to solve this, change the reclaim work thread from per-cpu to per-node and meanwhile dont bind these work thread to any cpus. 
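The structural change is easiest to see in isolation: one plain work item per node instead of a delayed work per CPU, dispatched on the unbound workqueue so the scheduler decides placement. A minimal sketch using the same primitives as the vmscan.c diff below (array and function names here are illustrative; the patch itself uses vmscan_works[] and shrink_shepherd()):

static struct work_struct reclaim_works[MAX_NUMNODES];

static void shepherd_pass_sketch(void)
{
        int node;

        /* one page-cache reclaim worker per online node, bound to no CPU */
        for_each_online_node(node)
                if (!work_pending(&reclaim_works[node]))
                        queue_work_node(node, system_unbound_wq,
                                        &reclaim_works[node]);
}

To compensate for the lower worker count, the per-pass target in __shrink_page_cache() is scaled by nr_cpus_node(), so one node-wide pass reclaims roughly what the old per-CPU workers did in aggregate.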
Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/vmscan.c | 61 +++++++++++++++-------------------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d59b22a7ba9a..42f24473756b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,7 +185,7 @@ int vm_cache_reclaim_weight __read_mostly; int vm_cache_reclaim_weight_min; int vm_cache_reclaim_weight_max; int vm_cache_reclaim_enable; -static DEFINE_PER_CPU(struct delayed_work, vmscan_work); +static struct work_struct vmscan_works[MAX_NUMNODES]; #endif static LIST_HEAD(shrinker_list); @@ -3972,8 +3972,8 @@ static unsigned long __shrink_page_cache(gfp_t mask) .gfp_mask = current_gfp_context(mask), .reclaim_idx = gfp_zone(mask), .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX * - (unsigned long)vm_cache_reclaim_weight, + .nr_to_reclaim = SWAP_CLUSTER_MAX * nr_cpus_node(numa_node_id()) * + vm_cache_reclaim_weight, .may_unmap = 1, .may_swap = mem_reliable_is_enabled() ? 0 : 1, .no_shrink_slab = mem_reliable_is_enabled() ? 0 : 1, @@ -3997,26 +3997,21 @@ static DECLARE_DEFERRABLE_WORK(shepherd, shrink_shepherd); static void shrink_shepherd(struct work_struct *w) { - int cpu; - get_online_cpus(); + int node; - for_each_online_cpu(cpu) { - struct delayed_work *work = &per_cpu(vmscan_work, cpu); - - if (!delayed_work_pending(work) && vm_cache_reclaim_enable) - queue_delayed_work_on(cpu, system_wq, work, 0); + for_each_online_node(node) { + if (!work_pending(&vmscan_works[node]) && vm_cache_reclaim_enable) + queue_work_node(node, system_unbound_wq, &vmscan_works[node]); } - put_online_cpus(); - /* we want all kernel thread to stop */ if (vm_cache_reclaim_enable) { if (vm_cache_reclaim_s == 0) - schedule_delayed_work(&shepherd, + queue_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative(120 * HZ)); else - schedule_delayed_work(&shepherd, + queue_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative((unsigned long) vm_cache_reclaim_s * HZ)); } @@ -4024,15 +4019,12 @@ static void shrink_shepherd(struct work_struct *w) static void shrink_shepherd_timer(void) { - int cpu; - - for_each_possible_cpu(cpu) { - struct delayed_work *work = &per_cpu(vmscan_work, cpu); + int i; - INIT_DEFERRABLE_WORK(work, shrink_page_cache_work); - } + for (i = 0; i < MAX_NUMNODES; i++) + INIT_WORK(&vmscan_works[i], shrink_page_cache_work); - schedule_delayed_work(&shepherd, + queue_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); } @@ -4048,9 +4040,6 @@ unsigned long shrink_page_cache(gfp_t mask) static void shrink_page_cache_work(struct work_struct *w) { - struct delayed_work *work = to_delayed_work(w); - unsigned long nr_pages; - /* * if vm_cache_reclaim_enable or vm_cache_reclaim_s is zero, * we do not shrink page cache again. 
@@ -4063,10 +4052,7 @@ static void shrink_page_cache_work(struct work_struct *w) return; /* It should wait more time if we hardly reclaim the page cache */ - nr_pages = shrink_page_cache(GFP_KERNEL); - if ((nr_pages < SWAP_CLUSTER_MAX) && vm_cache_reclaim_enable) - queue_delayed_work_on(smp_processor_id(), system_wq, work, - round_jiffies_relative((vm_cache_reclaim_s + 120) * HZ)); + shrink_page_cache(GFP_KERNEL); } static void shrink_page_cache_init(void) @@ -4088,13 +4074,6 @@ static void shrink_page_cache_init(void) shrink_shepherd_timer(); } -static int kswapd_cpu_down_prep(unsigned int cpu) -{ - cancel_delayed_work_sync(&per_cpu(vmscan_work, cpu)); - - return 0; -} - int cache_reclaim_enable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -4105,8 +4084,8 @@ int cache_reclaim_enable_handler(struct ctl_table *table, int write, return ret; if (write) - schedule_delayed_work(&shepherd, round_jiffies_relative( - (unsigned long)vm_cache_reclaim_s * HZ)); + queue_delayed_work(system_unbound_wq, &shepherd, + round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); return 0; } @@ -4121,7 +4100,7 @@ int cache_reclaim_sysctl_handler(struct ctl_table *table, int write, return ret; if (write) - mod_delayed_work(system_wq, &shepherd, + mod_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative( (unsigned long)vm_cache_reclaim_s * HZ)); @@ -4194,15 +4173,9 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); -#ifdef CONFIG_SHRINK_PAGECACHE - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "mm/vmscan:online", kswapd_cpu_online, - kswapd_cpu_down_prep); -#else ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmscan:online", kswapd_cpu_online, NULL); -#endif WARN_ON(ret < 0); #ifdef CONFIG_SHRINK_PAGECACHE shrink_page_cache_init(); -- Gitee From 1fbc8ee56564350edf4d89464cca29d0a0e4c92a Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Mon, 9 May 2022 11:30:09 +0000 Subject: [PATCH 319/340] mm: Update reliable flag in memory allocaion for reliable task only in task context hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Since interrupt may occupy reliable task's context and its current->flags will have PF_RELIABLE and this will lead to redirect it's memory allocation to mirrored region. Update reliable task's gfp flag can only happen in normal task context by checking in_task(). Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 38f2a84d3224..3ae924633e2c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4625,6 +4625,9 @@ static inline void prepare_before_alloc(gfp_t *gfp_mask) return; } + if (!in_task()) + return; + if (is_global_init(current) || (current->flags & PF_RELIABLE)) *gfp_mask |= ___GFP_RELIABILITY; } -- Gitee From 0ddc1e480f6deb139b4b2236cbdf7cafbbc0548e Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 12 May 2022 14:32:44 +0000 Subject: [PATCH 320/340] ext4: fix use-after-free in ext4_search_dir mainline inclusion from mainline-v5.18-rc4 commit c186f0887fe7061a35cebef024550ec33ef8fbd8 category: bugfix bugzilla: 186477, https://gitee.com/openeuler/kernel/issues/I55UHT CVE: NA ------------------------------------------------- We got issue as follows: EXT4-fs (loop0): mounted filesystem without journal. 
Opts: ,errors=continue ================================================================== BUG: KASAN: use-after-free in ext4_search_dir fs/ext4/namei.c:1394 [inline] BUG: KASAN: use-after-free in search_dirblock fs/ext4/namei.c:1199 [inline] BUG: KASAN: use-after-free in __ext4_find_entry+0xdca/0x1210 fs/ext4/namei.c:1553 Read of size 1 at addr ffff8881317c3005 by task syz-executor117/2331 CPU: 1 PID: 2331 Comm: syz-executor117 Not tainted 5.10.0+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:83 [inline] dump_stack+0x144/0x187 lib/dump_stack.c:124 print_address_description+0x7d/0x630 mm/kasan/report.c:387 __kasan_report+0x132/0x190 mm/kasan/report.c:547 kasan_report+0x47/0x60 mm/kasan/report.c:564 ext4_search_dir fs/ext4/namei.c:1394 [inline] search_dirblock fs/ext4/namei.c:1199 [inline] __ext4_find_entry+0xdca/0x1210 fs/ext4/namei.c:1553 ext4_lookup_entry fs/ext4/namei.c:1622 [inline] ext4_lookup+0xb8/0x3a0 fs/ext4/namei.c:1690 __lookup_hash+0xc5/0x190 fs/namei.c:1451 do_rmdir+0x19e/0x310 fs/namei.c:3760 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x445e59 Code: 4d c7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 1b c7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fff2277fac8 EFLAGS: 00000246 ORIG_RAX: 0000000000000054 RAX: ffffffffffffffda RBX: 0000000000400280 RCX: 0000000000445e59 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000200000c0 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000002 R10: 00007fff2277f990 R11: 0000000000000246 R12: 0000000000000000 R13: 431bde82d7b634db R14: 0000000000000000 R15: 0000000000000000 The buggy address belongs to the page: page:0000000048cd3304 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1317c3 flags: 0x200000000000000() raw: 0200000000000000 ffffea0004526588 ffffea0004528088 0000000000000000 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881317c2f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8881317c2f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff8881317c3000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8881317c3080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8881317c3100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== ext4_search_dir: ... de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { ... if ((char *) de + de->name_len <= dlimit && ext4_match(dir, fname, de)) { ... } ... de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -1; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } Assume: de=0xffff8881317c2fff dlimit=0x0xffff8881317c3000 If read 'de->name_len' which address is 0xffff8881317c3005, obviously is out of range, then will trigger use-after-free. To solve this issue, 'dlimit' must reserve 8 bytes, as we will read 'de->name_len' to judge if '(char *) de + de->name_len' out of range. 
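The 8 bytes being reserved are exactly the fixed-size header of an on-disk directory entry, i.e. everything that precedes the variable-length name. The layout (as declared in fs/ext4/ext4.h) makes the arithmetic behind the new EXT4_BASE_DIR_LEN macro in the following diff concrete:

struct ext4_dir_entry_2 {
        __le32  inode;                  /* 4 bytes: inode number          */
        __le16  rec_len;                /* 2 bytes: length of this entry  */
        __u8    name_len;               /* 1 byte:  length of the name    */
        __u8    file_type;              /* 1 byte                         */
        char    name[EXT4_NAME_LEN];    /* variable, up to 255 bytes      */
};

/*
 * EXT4_BASE_DIR_LEN = sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN = 8.
 * Stopping the scan at "dlimit - EXT4_BASE_DIR_LEN" therefore guarantees
 * that the whole fixed header - in particular de->name_len - is readable
 * before it is dereferenced, which the old "(char *)de < dlimit" check did
 * not ensure for an entry starting within the last few bytes of the block.
 */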
Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220324064816.1209985-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org conflicts: fs/ext4/namei.c Signed-off-by: ChenXiaoSong Reviewed-by: Zhang Yi Reviewed-by: yebin Signed-off-by: Yongqiang Liu --- fs/ext4/ext4.h | 4 ++++ fs/ext4/namei.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6787d12d5abf..16e2b719d47c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1985,6 +1985,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) * Structure of a directory entry */ #define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 867deea2fbdb..e3e57eab371f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1304,10 +1304,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; - while ((char *) de < dlimit) { + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit && + if (de->name + de->name_len <= dlimit && ext4_match(fname, de)) { /* found a match - just to be sure, do * a full check */ -- Gitee From e2aee86820958e18ca7e7dabf920700cdc48607c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 12 May 2022 14:32:45 +0000 Subject: [PATCH 321/340] ext4: unregister sysfs path before destroying jbd2 journal mainline inclusion from mainline-v5.7-rc1 commit 5e47868fb94b63c7068dcfd2469d99fba06bae9d category: bugfix bugzilla: 186675, https://gitee.com/openeuler/kernel/issues/I55TUC CVE: NA ------------------------------------------------- Call ext4_unregister_sysfs(), before destroying jbd2 journal, since below might cause, NULL pointer dereference issue. This got reported with LTP tests. ext4_put_super() cat /sys/fs/ext4/loop2/journal_task | ext4_attr_show(); ext4_jbd2_journal_destroy(); | | journal_task_show() | | | task_pid_vnr(NULL); sbi->s_journal = NULL; Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200318061301.4320-1-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o conflicts: fs/ext4/super.c Signed-off-by: ChenXiaoSong Reviewed-by: yebin Reviewed-by: Zhang Yi Signed-off-by: Yongqiang Liu --- fs/ext4/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f961f7d94a70..eb70be800507 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1160,6 +1160,13 @@ static void ext4_put_super(struct super_block *sb) flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); + /* + * Unregister sysfs before destroying jbd2 journal. 
+ * Since we could still access attr_journal_task attribute via sysfs + * path which could have sbi->s_journal->j_task as NULL + */ + ext4_unregister_sysfs(sb); + if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); @@ -1169,7 +1176,6 @@ static void ext4_put_super(struct super_block *sb) } } - ext4_unregister_sysfs(sb); ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); -- Gitee From 84597fe1ff2df016d4d99f36a3f0bd9e7c6e823f Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 12 May 2022 14:32:46 +0000 Subject: [PATCH 322/340] ext4: fix bug_on in start_this_handle during umount filesystem mainline inclusion from mainline-v5.18-rc4 commit b98535d091795a79336f520b0708457aacf55c67 category: bugfix bugzilla: 186675, https://gitee.com/openeuler/kernel/issues/I55TUC CVE: NA ------------------------------------------------- We got issue as follows: ------------[ cut here ]------------ kernel BUG at fs/jbd2/transaction.c:389! invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 9 PID: 131 Comm: kworker/9:1 Not tainted 5.17.0-862.14.0.6.x86_64-00001-g23f87daf7d74-dirty #197 Workqueue: events flush_stashed_error_work RIP: 0010:start_this_handle+0x41c/0x1160 RSP: 0018:ffff888106b47c20 EFLAGS: 00010202 RAX: ffffed10251b8400 RBX: ffff888128dc204c RCX: ffffffffb52972ac RDX: 0000000000000200 RSI: 0000000000000004 RDI: ffff888128dc2050 RBP: 0000000000000039 R08: 0000000000000001 R09: ffffed10251b840a R10: ffff888128dc204f R11: ffffed10251b8409 R12: ffff888116d78000 R13: 0000000000000000 R14: dffffc0000000000 R15: ffff888128dc2000 FS: 0000000000000000(0000) GS:ffff88839d680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001620068 CR3: 0000000376c0e000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: jbd2__journal_start+0x38a/0x790 jbd2_journal_start+0x19/0x20 flush_stashed_error_work+0x110/0x2b3 process_one_work+0x688/0x1080 worker_thread+0x8b/0xc50 kthread+0x26f/0x310 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace 0000000000000000 ]--- Above issue may happen as follows: umount read procfs error_work ext4_put_super flush_work(&sbi->s_error_work); ext4_mb_seq_groups_show ext4_mb_load_buddy_gfp ext4_mb_init_group ext4_mb_init_cache ext4_read_block_bitmap_nowait ext4_validate_block_bitmap ext4_error ext4_handle_error schedule_work(&EXT4_SB(sb)->s_error_work); ext4_unregister_sysfs(sb); jbd2_journal_destroy(sbi->s_journal); journal_kill_thread journal->j_flags |= JBD2_UNMOUNT; flush_stashed_error_work jbd2_journal_start start_this_handle BUG_ON(journal->j_flags & JBD2_UNMOUNT); To solve this issue, we call 'ext4_unregister_sysfs() before flushing s_error_work in ext4_put_super(). 
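The fix is purely an ordering change in ext4_put_super(); a condensed view of the teardown sequence established by the diff below (error handling and the aborted-journal bookkeeping are kept in the real code):

        ext4_unregister_sysfs(sb);      /* remove the /sys and /proc/fs/ext4/<dev>
                                         * entries first, so no reader of mb_groups
                                         * can queue s_error_work from this point on */
        ext4_unregister_li_request(sb);
        ext4_quota_off_umount(sb);
        flush_work(&sbi->s_error_work); /* drain error work that is already queued */
        destroy_workqueue(sbi->rsv_conversion_wq);
        jbd2_journal_destroy(sbi->s_journal);   /* sets JBD2_UNMOUNT; any later
                                                 * start_this_handle() would hit the
                                                 * BUG_ON shown in the trace above */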
Signed-off-by: Ye Bin Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20220322012419.725457-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o conflicts: fs/ext4/super.c Signed-off-by: ChenXiaoSong Reviewed-by: Zhang Yi Reviewed-by: yebin Signed-off-by: Yongqiang Liu --- fs/ext4/super.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb70be800507..4d9de6216fe4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1154,19 +1154,24 @@ static void ext4_put_super(struct super_block *sb) int aborted = 0; int i, err; - ext4_unregister_li_request(sb); - ext4_quota_off_umount(sb); - - flush_work(&sbi->s_error_work); - destroy_workqueue(sbi->rsv_conversion_wq); - /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL + * Unregister sysfs before flush sbi->s_error_work. + * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If + * read metadata verify failed then will queue error work. + * flush_stashed_error_work will call start_this_handle may trigger + * BUG_ON. */ ext4_unregister_sysfs(sb); + ext4_unregister_li_request(sb); + ext4_quota_off_umount(sb); + + flush_work(&sbi->s_error_work); + destroy_workqueue(sbi->rsv_conversion_wq); + if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); -- Gitee From 5ba8a3eb026f02aa75f4a22099250bfd4db659cb Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 13 May 2022 02:20:37 +0000 Subject: [PATCH 323/340] ax25: add refcount in ax25_dev to avoid UAF bugs mainline inclusion from mainline-v5.17-rc3 commit d01ffb9eee4af165d83b08dd73ebdf9fe94a519b category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5783H CVE: CVE-2022-1204 -------------------------------- If we dereference ax25_dev after we call kfree(ax25_dev) in ax25_dev_device_down(), it will lead to concurrency UAF bugs. There are eight syscall functions suffer from UAF bugs, include ax25_bind(), ax25_release(), ax25_connect(), ax25_ioctl(), ax25_getname(), ax25_sendmsg(), ax25_getsockopt() and ax25_info_show(). One of the concurrency UAF can be shown as below: (USE) | (FREE) | ax25_device_event | ax25_dev_device_down ax25_bind | ... ... | kfree(ax25_dev) ax25_fillin_cb() | ... ax25_fillin_cb_from_dev() | ... | The root cause of UAF bugs is that kfree(ax25_dev) in ax25_dev_device_down() is not protected by any locks. When ax25_dev, which there are still pointers point to, is released, the concurrency UAF bug will happen. This patch introduces refcount into ax25_dev in order to guarantee that there are no pointers point to it when ax25_dev is released. Signed-off-by: Duoming Zhou Signed-off-by: David S. 
Miller conflict: net/ax25/af_ax25.c net/ax25/ax25_dev.c Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- include/net/ax25.h | 10 ++++++++++ net/ax25/af_ax25.c | 2 ++ net/ax25/ax25_dev.c | 12 ++++++++++-- net/ax25/ax25_route.c | 3 +++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/net/ax25.h b/include/net/ax25.h index 8b7eb46ad72d..d81bfb674906 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -236,6 +236,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev; typedef struct ax25_cb { @@ -290,6 +291,15 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } +#define ax25_dev_hold(__ax25_dev) \ + refcount_inc(&((__ax25_dev)->refcount)) + +static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 2b0bee2dbbb7..46c6065e377b 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -102,6 +102,7 @@ static void ax25_kill_by_device(struct net_device *dev) lock_sock(sk); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; + ax25_dev_put(ax25_dev); release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); @@ -449,6 +450,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) } out_put: + ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret; diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index d92195cd7834..76d105390706 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -40,6 +40,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; + ax25_dev_hold(ax25_dev); } spin_unlock_bh(&ax25_dev_lock); @@ -59,6 +60,7 @@ void ax25_dev_device_up(struct net_device *dev) return; } + refcount_set(&ax25_dev->refcount, 1); dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; dev_hold(dev); @@ -86,6 +88,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; + ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -115,20 +118,22 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } @@ -136,6 +141,7 @@ void ax25_dev_device_down(struct net_device *dev) } spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; + ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) @@ -152,6 +158,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) if (ax25_dev->forward != NULL) return -EINVAL; ax25_dev->forward = fwd_dev->dev; + ax25_dev_put(fwd_dev); break; case SIOCAX25DELFWD: @@ -164,6 +171,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) 
return -EINVAL; } + ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 66d54fc11831..cd380767245c 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -119,6 +119,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; + ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); @@ -175,6 +176,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return 0; @@ -217,6 +219,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return err; } -- Gitee From 8726f0cd968784ff0c3d3ccd61a1e81faf14c09c Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 13 May 2022 02:20:38 +0000 Subject: [PATCH 324/340] ax25: fix reference count leaks of ax25_dev mainline inclusion from mainline-v5.17-rc3 commit 87563a043cef044fed5db7967a75741cc16ad2b1 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5783H CVE: CVE-2022-1204 -------------------------------- The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") introduces refcount into ax25_dev, but there are reference leak paths in ax25_ctl_ioctl(), ax25_fwd_ioctl(), ax25_rt_add(), ax25_rt_del() and ax25_rt_opt(). This patch uses ax25_dev_put() and adjusts the position of ax25_addr_ax25dev() to fix reference cout leaks of ax25_dev. Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Signed-off-by: Duoming Zhou Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20220203150811.42256-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski conflict: net/ax25/ax25_dev.c Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- include/net/ax25.h | 8 +++++--- net/ax25/af_ax25.c | 12 ++++++++---- net/ax25/ax25_dev.c | 24 +++++++++++++++++------- net/ax25/ax25_route.c | 16 +++++++++++----- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/include/net/ax25.h b/include/net/ax25.h index d81bfb674906..aadff553e4b7 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -291,10 +291,12 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } -#define ax25_dev_hold(__ax25_dev) \ - refcount_inc(&((__ax25_dev)->refcount)) +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +} -static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 46c6065e377b..1fc954d34021 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -369,21 +369,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT; - if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) - return -ENODEV; - if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL; if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); + if (!ax25_dev) + return -ENODEV; + digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = 
ax25_ctl.digi_addr[k]; - if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); + if (!ax25) { + ax25_dev_put(ax25_dev); return -ENOTCONN; + } switch (ax25_ctl.cmd) { case AX25_KILL: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 76d105390706..55a611f7239b 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -88,8 +88,8 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; - ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_hold(ax25_dev); ax25_register_dev_sysctl(ax25_dev); } @@ -118,8 +118,8 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); ax25_dev_put(ax25_dev); @@ -129,8 +129,8 @@ void ax25_dev_device_down(struct net_device *dev) while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); ax25_dev_put(ax25_dev); @@ -153,25 +153,35 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) switch (cmd) { case SIOCAX25ADDFWD: - if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + fwd_dev = ax25_addr_ax25dev(&fwd->port_to); + if (!fwd_dev) { + ax25_dev_put(ax25_dev); return -EINVAL; - if (ax25_dev->forward != NULL) + } + if (ax25_dev->forward) { + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: - if (ax25_dev->forward == NULL) + if (!ax25_dev->forward) { + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = NULL; + ax25_dev_put(ax25_dev); break; default: + ax25_dev_put(ax25_dev); return -EINVAL; } - ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index cd380767245c..8f81de88f006 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -78,11 +78,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_dev *ax25_dev; int i; - if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) - return -EINVAL; if (route->digi_count > AX25_MAX_DIGIS) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&route->port_addr); + if (!ax25_dev) + return -EINVAL; + write_lock_bh(&ax25_route_lock); ax25_rt = ax25_route_list; @@ -94,6 +96,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -104,6 +107,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } ax25_rt = ax25_rt->next; @@ -111,6 +115,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } @@ -119,11 +124,11 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; 
ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; - ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -136,6 +141,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->next = ax25_route_list; ax25_route_list = ax25_rt; write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -176,8 +182,8 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -219,8 +225,8 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return err; } -- Gitee From 42688afbfc39f0d63b9c650d41f1ee7779e932f1 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 13 May 2022 02:20:39 +0000 Subject: [PATCH 325/340] ax25: fix UAF bugs of net_device caused by rebinding operation mainline inclusion from mainline-v5.17-rc4 commit feef318c855a361a1eccd880f33e88c460eb63b4 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5783H CVE: CVE-2022-1204 -------------------------------- The ax25_kill_by_device() will set s->ax25_dev = NULL and call ax25_disconnect() to change states of ax25_cb and sock, if we call ax25_bind() before ax25_kill_by_device(). However, if we call ax25_bind() again between the window of ax25_kill_by_device() and ax25_dev_device_down(), the values and states changed by ax25_kill_by_device() will be reassigned. Finally, ax25_dev_device_down() will deallocate net_device. If we dereference net_device in syscall functions such as ax25_release(), ax25_sendmsg(), ax25_getsockopt(), ax25_getname() and ax25_info_show(), a UAF bug will occur. One of the possible race conditions is shown below: (USE) | (FREE) ax25_bind() | | ax25_kill_by_device() ax25_bind() | ax25_connect() | ... | ax25_dev_device_down() | ... | dev_put_track(dev, ...) //FREE ax25_release() | ... ax25_send_control() | alloc_skb() //USE | the corresponding fail log is shown below: BUG: KASAN: use-after-free in ax25_send_control+0x43/0x210 ... Call Trace: ... ax25_send_control+0x43/0x210 ax25_release+0x2db/0x3b0 __sock_release+0x6d/0x120 sock_close+0xf/0x20 __fput+0x11f/0x420 ... Allocated by task 1283: ... __kasan_kmalloc+0x81/0xa0 alloc_netdev_mqs+0x5a/0x680 mkiss_open+0x6c/0x380 tty_ldisc_open+0x55/0x90 ... Freed by task 1969: ... kfree+0xa3/0x2c0 device_release+0x54/0xe0 kobject_put+0xa5/0x120 tty_ldisc_kill+0x3e/0x80 ... In order to fix these UAF bugs caused by rebinding operation, this patch adds dev_hold_track() into ax25_bind() and corresponding dev_put_track() into ax25_kill_by_device(). Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller conflict: net/ax25/af_ax25.c The commit in mainline use dev_{hold/put}track, which 4.19 has not introduced, so use dev{hold/put} instead. 
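Functionally the substitution noted above is equivalent: mainline's dev_hold_track()/dev_put_track() perform the same reference-count operations plus a reference-tracking cookie that 4.19 does not have. What the backport preserves is the strict pairing between the two sides, condensed here from the af_ax25.c hunks below:

        /* ax25_bind(): attaching to a device pins the underlying net_device,
         * in addition to the ax25_dev reference taken via ax25_addr_ax25dev() */
        if (ax25_dev) {
                ax25_fillin_cb(ax25, ax25_dev);
                dev_hold(ax25_dev->dev);
        }

        /* ax25_kill_by_device(): tearing the binding down drops both again */
        s->ax25_dev = NULL;
        dev_put(ax25_dev->dev);
        ax25_dev_put(ax25_dev);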
Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1fc954d34021..3769ffbe3302 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -102,6 +102,7 @@ static void ax25_kill_by_device(struct net_device *dev) lock_sock(sk); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; + dev_put(ax25_dev->dev); ax25_dev_put(ax25_dev); release_sock(sk); spin_lock_bh(&ax25_list_lock); @@ -1123,8 +1124,10 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } } - if (ax25_dev != NULL) + if (ax25_dev) { ax25_fillin_cb(ax25, ax25_dev); + dev_hold(ax25_dev->dev); + } done: ax25_cb_add(ax25); -- Gitee From 3b5be7185ac2f86e7f2fb6c3b01217a56b0edd81 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 13 May 2022 02:20:40 +0000 Subject: [PATCH 326/340] ax25: Fix refcount leaks caused by ax25_cb_del() mainline inclusion from mainline-v5.18-rc1 commit 9fd75b66b8f68498454d685dc4ba13192ae069b0 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5783H CVE: CVE-2022-1204 -------------------------------- The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") and commit feef318c855a ("ax25: fix UAF bugs of net_device caused by rebinding operation") increase the refcounts of ax25_dev and net_device in ax25_bind() and decrease the matching refcounts in ax25_kill_by_device() in order to prevent UAF bugs, but there are reference count leaks. The root cause of refcount leaks is shown below: (Thread 1) | (Thread 2) ax25_bind() | ... | ax25_addr_ax25dev() | ax25_dev_hold() //(1) | ... | dev_hold_track() //(2) | ... | ax25_destroy_socket() | ax25_cb_del() | ... | hlist_del_init() //(3) | | (Thread 3) | ax25_kill_by_device() | ... | ax25_for_each(s, &ax25_list) { | if (s->ax25_dev == ax25_dev) //(4) | ... | Firstly, we use ax25_bind() to increase the refcount of ax25_dev in position (1) and increase the refcount of net_device in position (2). Then, we use ax25_cb_del() invoked by ax25_destroy_socket() to delete ax25_cb in hlist in position (3) before calling ax25_kill_by_device(). Finally, the decrements of refcounts in ax25_kill_by_device() will not be executed, because no s->ax25_dev equals to ax25_dev in position (4). This patch adds decrements of refcounts in ax25_release() and use lock_sock() to do synchronization. If refcounts decrease in ax25_release(), the decrements of refcounts in ax25_kill_by_device() will not be executed and vice versa. Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Fixes: 87563a043cef ("ax25: fix reference count leaks of ax25_dev") Fixes: feef318c855a ("ax25: fix UAF bugs of net_device caused by rebinding operation") Reported-by: Thomas Osterried Signed-off-by: Duoming Zhou Signed-off-by: David S. 
Miller conflict: net/ax25/af_ax25.c Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 3769ffbe3302..3f79e21cb2de 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -102,8 +102,10 @@ static void ax25_kill_by_device(struct net_device *dev) lock_sock(sk); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; - dev_put(ax25_dev->dev); - ax25_dev_put(ax25_dev); + if (sk->sk_socket) { + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + } release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); @@ -979,14 +981,20 @@ static int ax25_release(struct socket *sock) { struct sock *sk = sock->sk; ax25_cb *ax25; + ax25_dev *ax25_dev; if (sk == NULL) return 0; sock_hold(sk); - sock_orphan(sk); lock_sock(sk); + sock_orphan(sk); ax25 = sk_to_ax25(sk); + ax25_dev = ax25->ax25_dev; + if (ax25_dev) { + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + } if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { -- Gitee From a1cbc83674b1528169c803031bb47dc12adc1a7a Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 13 May 2022 02:20:41 +0000 Subject: [PATCH 327/340] ax25: fix UAF bug in ax25_send_control() mainline inclusion from mainline-v5.18-rc1 commit 5352a761308397a0e6250fdc629bb3f615b94747 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5783H CVE: CVE-2022-1204 -------------------------------- There are UAF bugs in ax25_send_control(), when we call ax25_release() to deallocate ax25_dev. The possible race condition is shown below: (Thread 1) | (Thread 2) ax25_dev_device_up() //(1) | | ax25_kill_by_device() ax25_bind() //(2) | ax25_connect() | ... ax25->state = AX25_STATE_1 | ... | ax25_dev_device_down() //(3) (Thread 3) ax25_release() | ax25_dev_put() //(4) FREE | case AX25_STATE_1: | ax25_send_control() | alloc_skb() //USE | The refcount of ax25_dev increases in position (1) and (2), and decreases in position (3) and (4). The ax25_dev will be freed before dereference sites in ax25_send_control(). The following is part of the report: [ 102.297448] BUG: KASAN: use-after-free in ax25_send_control+0x33/0x210 [ 102.297448] Read of size 8 at addr ffff888009e6e408 by task ax25_close/602 [ 102.297448] Call Trace: [ 102.303751] ax25_send_control+0x33/0x210 [ 102.303751] ax25_release+0x356/0x450 [ 102.305431] __sock_release+0x6d/0x120 [ 102.305431] sock_close+0xf/0x20 [ 102.305431] __fput+0x11f/0x420 [ 102.305431] task_work_run+0x86/0xd0 [ 102.307130] get_signal+0x1075/0x1220 [ 102.308253] arch_do_signal_or_restart+0x1df/0xc00 [ 102.308253] exit_to_user_mode_prepare+0x150/0x1e0 [ 102.308253] syscall_exit_to_user_mode+0x19/0x50 [ 102.308253] do_syscall_64+0x48/0x90 [ 102.308253] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 102.308253] RIP: 0033:0x405ae7 This patch defers the free operation of ax25_dev and net_device after all corresponding dereference sites in ax25_release() to avoid UAF. 
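The shape of the change is again pure ordering inside ax25_release(): the two reference drops introduced by the previous patch are moved below the last statement that can still touch ax25_dev->dev. A condensed sketch (the real function in the diff below handles every AX.25 state and SOCK_DGRAM sockets as well):

        ax25_dev = ax25->ax25_dev;      /* references from bind are still held */

        if (sk->sk_type == SOCK_SEQPACKET && ax25->state == AX25_STATE_1)
                /* may allocate an skb against ax25_dev->dev */
                ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);

        /* only after the last possible dereference are the net_device and
         * ax25_dev references finally dropped */
        if (ax25_dev) {
                dev_put(ax25_dev->dev);
                ax25_dev_put(ax25_dev);
        }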
Fixes: 9fd75b66b8f6 ("ax25: Fix refcount leaks caused by ax25_cb_del()") Signed-off-by: Duoming Zhou Signed-off-by: Paolo Abeni Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Wei Yongjun Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 3f79e21cb2de..1fab69daa968 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -991,10 +991,6 @@ static int ax25_release(struct socket *sock) sock_orphan(sk); ax25 = sk_to_ax25(sk); ax25_dev = ax25->ax25_dev; - if (ax25_dev) { - dev_put(ax25_dev->dev); - ax25_dev_put(ax25_dev); - } if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { @@ -1056,6 +1052,10 @@ static int ax25_release(struct socket *sock) sk->sk_state_change(sk); ax25_destroy_socket(ax25); } + if (ax25_dev) { + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + } sock->sk = NULL; release_sock(sk); -- Gitee From a5aba8243f662fb092a94b569794f619da398bbb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 May 2022 02:58:25 +0000 Subject: [PATCH 328/340] io_uring: fix race between timeout flush and removal stable inclusion from stable-v5.10.110 commit 2827328e646d0c2d3db1bfcad4b5f5016ce0d643 category: bugfix bugzilla: 186670,https://gitee.com/src-openeuler/kernel/issues/I54H78 CVE: CVE-2022-29582 -------------------------------- commit e677edbcabee849bfdd43f1602bccbecf736a646 upstream. io_flush_timeouts() assumes the timeout isn't in progress of triggering or being removed/canceled, so it unconditionally removes it from the timeout list and attempts to cancel it. Leave it on the list and let the normal timeout cancelation take care of it. Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman Signed-off-by: Guo Xuenan Conflicts: fs/io_uring.c Reviewed-by: Zhang Yi Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a21d1d67d725..c104425b2557 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1332,6 +1332,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) static void io_flush_timeouts(struct io_ring_ctx *ctx) { + struct io_kiocb *req, *tmp; u32 seq; if (list_empty(&ctx->timeout_list)) @@ -1339,10 +1340,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - do { + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { u32 events_needed, events_got; - struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1359,9 +1358,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - list_del_init(&req->timeout.list); io_kill_timeout(req); - } while (!list_empty(&ctx->timeout_list)); + } ctx->cq_last_tm_flush = seq; } @@ -5213,6 +5211,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, else data->mode = HRTIMER_MODE_REL; + INIT_LIST_HEAD(&req->timeout.list); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); return 0; } @@ -6032,6 +6031,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) prev = NULL; } + list_del(&req->timeout.list); spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { -- Gitee From 009c22bb86b3d6214e48d0e05a1648ebf62d8416 Mon Sep 17 00:00:00 2001 From: Yu 
Liao Date: Fri, 13 May 2022 08:34:34 +0000 Subject: [PATCH 329/340] mm: clear_freelist_page: Provide timeout mechanism for worker runtime hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I558LD CVE: NA -------------------------------- In the case of large memory, the clear freelist will hold zone lock for a long time. As a result, the process may be blocked unless clear freelist thread exit, and causing the system to be reset by the watchdog. Provide a mechanism to stop clear freelist threads when elapsed time exceeds cfp_timeout, which can be set by module_param(). Signed-off-by: Yu Liao Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/clear_freelist_page.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c index f0a13b8fdcdb..1ea59a3e527f 100644 --- a/mm/clear_freelist_page.c +++ b/mm/clear_freelist_page.c @@ -13,8 +13,10 @@ #include #include #include +#include #include +#define CFP_DEFAULT_TIMEOUT 2000 #define for_each_populated_zone_pgdat(pgdat, zone) \ for (zone = pgdat->node_zones; \ zone; \ @@ -32,6 +34,7 @@ static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait); static DEFINE_MUTEX(clear_freelist_lock); static atomic_t clear_freelist_workers; static atomic_t clear_pages_num; +static ulong cfp_timeout_ms = CFP_DEFAULT_TIMEOUT; static int one = 1; /* @@ -51,15 +54,25 @@ static struct zone *next_pgdat_zone(struct zone *zone) static void clear_pgdat_freelist_pages(struct work_struct *work) { struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work); + u64 cfp_timeout_ns = cfp_timeout_ms * NSEC_PER_MSEC; struct pglist_data *pgdat = entry->pgdat; unsigned long flags, order, t; struct page *page; struct zone *zone; + u64 start, now; + + start = sched_clock(); for_each_populated_zone_pgdat(pgdat, zone) { spin_lock_irqsave(&zone->lock, flags); for_each_migratetype_order(order, t) { list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) { + now = sched_clock(); + if (unlikely(now - start > cfp_timeout_ns)) { + spin_unlock_irqrestore(&zone->lock, flags); + goto out; + } + #ifdef CONFIG_KMAP_LOCAL int i; @@ -77,6 +90,8 @@ static void clear_pgdat_freelist_pages(struct work_struct *work) cond_resched(); } + +out: kfree(entry); if (atomic_dec_and_test(&clear_freelist_workers)) @@ -170,3 +185,4 @@ static int __init clear_freelist_init(void) return 0; } module_init(clear_freelist_init); +module_param(cfp_timeout_ms, ulong, 0644); -- Gitee From da51b55eeff9ee6de1ad9724ed0f0f17ebc00aa9 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Fri, 13 May 2022 08:34:35 +0000 Subject: [PATCH 330/340] mm/share_pool: Support read-only memory allocation ascend inclusion category: Bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I578LV CVE: NA -------------------------------- When the driver uses the shared pool memory to share the memory with the user space, the user space is not allowed to operate this area. This prevents users from damaging sensitive data. When the sp_alloc and k2u processes apply for private memory, read-only memory can be applied for. 
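A rough userspace analogue of what SP_PROT_RO gives the consumer side is sketched below: the producer maps the shared object read-write while the consumer maps it through a second, read-only descriptor. The /sp_ro_demo name and the program itself are illustrative assumptions, not the share_pool interface.

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(void)
  {
          const char *name = "/sp_ro_demo";       /* illustrative name */
          int fd_rw = shm_open(name, O_CREAT | O_RDWR, 0600);
          int fd_ro = shm_open(name, O_RDONLY, 0);

          if (fd_rw < 0 || fd_ro < 0 || ftruncate(fd_rw, 4096) < 0)
                  return 1;

          char *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd_rw, 0);
          char *ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd_ro, 0);

          if (rw == MAP_FAILED || ro == MAP_FAILED)
                  return 1;

          strcpy(rw, "sensitive data");            /* producer writes */
          printf("read-only view sees: %s\n", ro); /* consumer may read ... */

          /* ... but cannot make its view writable: mprotect() is refused. */
          if (mprotect(ro, 4096, PROT_READ | PROT_WRITE) != 0)
                  perror("mprotect on read-only mapping");

          shm_unlink(name);
          return 0;
  }

Because the read-only mapping was never writable, the mprotect() upgrade attempt fails; clearing VM_MAYWRITE on the vma in the patch gives the consumer of an SP_PROT_RO area the same guarantee.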
Signed-off-by: Zhou Guanghui Reviewed-by: Weilong Chen Signed-off-by: Yongqiang Liu --- include/linux/share_pool.h | 3 ++- mm/share_pool.c | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 669d32f9a092..ba001069fbd2 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -13,6 +13,7 @@ #define SP_HUGEPAGE_ONLY (1 << 1) #define SP_DVPP (1 << 2) #define SP_SPEC_NODE_ID (1 << 3) +#define SP_PROT_RO (1 << 16) #define DEVICE_ID_BITS 4UL #define DEVICE_ID_MASK ((1UL << DEVICE_ID_BITS) - 1UL) @@ -22,7 +23,7 @@ #define NODE_ID_SHIFT (DEVICE_ID_SHIFT + DEVICE_ID_BITS) #define SP_FLAG_MASK (SP_HUGEPAGE | SP_HUGEPAGE_ONLY | SP_DVPP | \ - SP_SPEC_NODE_ID | \ + SP_SPEC_NODE_ID | SP_PROT_RO | \ (DEVICE_ID_MASK << DEVICE_ID_SHIFT) | \ (NODE_ID_MASK << NODE_ID_SHIFT)) diff --git a/mm/share_pool.c b/mm/share_pool.c index 833ecb7dd859..5ba353ddfabd 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2349,6 +2349,9 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, if (spg_node) prot = spg_node->prot; + if (ac->sp_flags & SP_PROT_RO) + prot = PROT_READ; + /* when success, mmap_addr == spa->va_start */ mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); if (IS_ERR_VALUE(mmap_addr)) { @@ -2373,6 +2376,10 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, ret = -EINVAL; goto unmap; } + + if (ac->sp_flags & SP_PROT_RO) + vma->vm_flags &= ~VM_MAYWRITE; + /* clean PTE_RDONLY flags or trigger SMMU event */ if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); @@ -2666,6 +2673,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, goto put_mm; } + if (kc && kc->sp_flags & SP_PROT_RO) + prot = PROT_READ; + ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); if (IS_ERR_VALUE(ret_addr)) { pr_debug("k2u mmap failed %lx\n", ret_addr); @@ -2678,6 +2688,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + if (kc && kc->sp_flags & SP_PROT_RO) + vma->vm_flags &= ~VM_MAYWRITE; + if (is_vm_hugetlb_page(vma)) { ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); if (ret) { @@ -2729,6 +2742,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un struct sp_area *spa; struct spg_proc_stat *stat; unsigned long prot = PROT_READ | PROT_WRITE; + struct sp_k2u_context kc; down_write(&sp_group_sem); stat = sp_init_process_stat(current, current->mm, spg_none); @@ -2747,8 +2761,8 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un } spa->kva = kva; - - uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, NULL); + kc.sp_flags = sp_flags; + uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, &kc); __sp_area_drop(spa); if (IS_ERR(uva)) pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva)); -- Gitee From fec3e16daebaabd2818c54f3405ce8923b2e0f3b Mon Sep 17 00:00:00 2001 From: Luo Jiaxing Date: Fri, 13 May 2022 08:34:36 +0000 Subject: [PATCH 331/340] scsi: hisi_sas: Don't fail IT nexus reset for Open Reject timeout mainline inclusion from mainline-v5.2-rc1 commit 246ea3c0ad02 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I538BF CVE: NA --------------------------------------------------------- Currently we call 
hisi_sas_softreset_ata_disk() in hisi_sas_I_T_nexus_reset(). If this fails for open reject reason, there is no reason to fail the IT nexus reset, so only fail for TMF_RESP_FUNC_FAILED. Some other strings spilled over multiple lines are reunited. Signed-off-by: Luo Jiaxing Signed-off-by: John Garry Signed-off-by: Martin K. Petersen Signed-off-by: Xingui Yang Reviewed-by: kang fenglong Signed-off-by: Yongqiang Liu --- drivers/scsi/hisi_sas/hisi_sas_main.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 32f55248c356..6ee5168a6bd8 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1277,8 +1277,7 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, /* no error, but return the number of bytes of * underrun */ - dev_warn(dev, "abort tmf: task to dev %016llx " - "resp: 0x%x sts 0x%x underrun\n", + dev_warn(dev, "abort tmf: task to dev %016llx resp: 0x%x sts 0x%x underrun\n", SAS_ADDR(device->sas_addr), task->task_status.resp, task->task_status.stat); @@ -1293,10 +1292,16 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, break; } - dev_warn(dev, "abort tmf: task to dev " - "%016llx resp: 0x%x status 0x%x\n", - SAS_ADDR(device->sas_addr), task->task_status.resp, - task->task_status.stat); + if (task->task_status.resp == SAS_TASK_COMPLETE && + task->task_status.stat == SAS_OPEN_REJECT) { + dev_warn(dev, "abort tmf: open reject failed\n"); + res = -EIO; + } else { + dev_warn(dev, "abort tmf: task to dev %016llx resp: 0x%x status 0x%x\n", + SAS_ADDR(device->sas_addr), + task->task_status.resp, + task->task_status.stat); + } sas_free_task(task); task = NULL; } @@ -1840,7 +1845,8 @@ static int hisi_sas_I_T_nexus_reset(struct domain_device *device) if (dev_is_sata(device)) { rc = hisi_sas_softreset_ata_disk(device); if (rc) - return TMF_RESP_FUNC_FAILED; + dev_err(dev, "I_T nexus reset: softreset failed (%d)\n", + rc); } rc = hisi_sas_debug_I_T_nexus_reset(device); @@ -2124,10 +2130,8 @@ _hisi_sas_internal_task_abort(struct hisi_hba *hisi_hba, } exit: - dev_dbg(dev, "internal task abort: task to dev %016llx task=%pK " - "resp: 0x%x sts 0x%x\n", - SAS_ADDR(device->sas_addr), - task, + dev_dbg(dev, "internal task abort: task to dev %016llx task=%p resp: 0x%x sts 0x%x\n", + SAS_ADDR(device->sas_addr), task, task->task_status.resp, /* 0 is complete, -1 is undelivered */ task->task_status.stat); sas_free_task(task); -- Gitee From 8fdd6417b411b8614315fa191de49775a35aa056 Mon Sep 17 00:00:00 2001 From: Xingui Yang Date: Fri, 13 May 2022 08:34:37 +0000 Subject: [PATCH 332/340] scsi: hisi_sas: Fix SAS disk sense info print incorrectly sometimes driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4ZO9V CVE: NA ---------------------------------- Sometimes disk response sense info, but driver print data underflow without sense, and it's not correct. we use scsi_normalize_sense instead of hisi_sas_get_sense_data to parse the sense info. before: data underflow without sense, rsp_code:0xf0, rc:-2. after: data underflow, rsp_code:0x70, sensekey:0x5, ASC:0x21, ASCQ:0x0. 
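For reference, the sketch below shows roughly what scsi_normalize_sense() extracts from a sense buffer for the fixed (0x70/0x71) and descriptor (0x72/0x73) formats; the sense_hdr struct, the normalize_sense() helper and the sample buffer are illustrative only, not the kernel implementation.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct sense_hdr {
          uint8_t response_code;
          uint8_t sense_key;
          uint8_t asc;
          uint8_t ascq;
  };

  static bool normalize_sense(const uint8_t *sb, int len, struct sense_hdr *h)
  {
          if (len < 1)
                  return false;

          h->response_code = sb[0] & 0x7f;
          if (h->response_code >= 0x72) {          /* descriptor format */
                  if (len < 4)
                          return false;
                  h->sense_key = sb[1] & 0x0f;
                  h->asc  = sb[2];
                  h->ascq = sb[3];
          } else {                                 /* fixed format */
                  if (len < 14)
                          return false;
                  h->sense_key = sb[2] & 0x0f;
                  h->asc  = sb[12];
                  h->ascq = sb[13];
          }
          return true;
  }

  int main(void)
  {
          /* Fixed-format sense: ILLEGAL REQUEST with ASC 0x21/ASCQ 0x00,
           * chosen to match the "after" log line above. */
          uint8_t sense[18] = { 0x70, 0, 0x05, 0, 0, 0, 0, 10,
                                0, 0, 0, 0, 0x21, 0x00 };
          struct sense_hdr h;

          if (normalize_sense(sense, sizeof(sense), &h))
                  printf("rsp_code:0x%x, sensekey:0x%x, ASC:0x%x, ASCQ:0x%x\n",
                         h.response_code, h.sense_key, h.asc, h.ascq);
          return 0;
  }

Run on the sample buffer it prints rsp_code:0x70, sensekey:0x5, ASC:0x21, ASCQ:0x0, matching the "after" line quoted above.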
Signed-off-by: Xingui Yang Reviewed-by: kang fenglong Signed-off-by: Yongqiang Liu --- drivers/scsi/hisi_sas/hisi_sas.h | 7 +-- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 72 ++++++++------------------ 2 files changed, 23 insertions(+), 56 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 2a70af0191af..f3bb10da8f86 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -572,12 +573,6 @@ struct hisi_sas_slot_dif_buf_table { struct hisi_sas_sge_dif_page sge_dif_page; }; -struct hisi_sas_sense_data { - int sense_key; - int add_sense_code; - int add_sense_code_qua; -}; - extern bool hisi_sas_debugfs_enable; extern struct dentry *hisi_sas_debugfs_dir; extern int skip_bus_flag; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 3c662fa5f6c0..a268a2ec686c 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2291,56 +2291,24 @@ slot_err_v3_hw(struct hisi_hba *hisi_hba, struct sas_task *task, } } -static int hisi_sas_get_sense_data(struct ssp_response_iu *iu, - struct hisi_sas_sense_data *sense_data) -{ - int rc = 0; - - switch (iu->resp_data[0]) { - case 0x70: - case 0x71: { - if (iu->sense_data_len > 13) { - sense_data->sense_key = (iu->resp_data[2] & 0xF); - sense_data->add_sense_code = iu->resp_data[12]; - sense_data->add_sense_code_qua = iu->resp_data[13]; - } else - rc = -1; - break; - } - case 0x72: - case 0x73: { - if (iu->sense_data_len > 4) { - sense_data->sense_key = (iu->resp_data[1] & 0xF); - sense_data->add_sense_code = iu->resp_data[2]; - sense_data->add_sense_code_qua = iu->resp_data[3]; - } else - rc = -1; - break; - } - default: - rc = -2; - break; - } - - return rc; -} - static int ssp_need_spin_up(struct hisi_sas_slot *slot) { - int rc; - struct hisi_sas_sense_data sense_data; + bool rc; + int sb_len; + u8 *sense_buffer; + struct scsi_sense_hdr sshdr; struct ssp_response_iu *iu = hisi_sas_status_buf_addr_mem(slot) + sizeof(struct hisi_sas_err_record); - rc = hisi_sas_get_sense_data(iu, &sense_data); + sb_len = iu->sense_data_len; + sense_buffer = iu->sense_data; + rc = scsi_normalize_sense(sense_buffer, sb_len, &sshdr); /* * if the SAS disk response with ASC=04h, * ASCQ=11h, host should send NOTIFY primitive. 
*/ - if (rc == 0 && - sense_data.add_sense_code == 0x4 && - sense_data.add_sense_code_qua == 0x11) + if (rc && sshdr.asc == 0x4 && sshdr.ascq == 0x11) return 1; return 0; @@ -2444,22 +2412,26 @@ slot_complete_v3_hw(struct hisi_hba *hisi_hba, struct hisi_sas_slot *slot) if ((error_info[3] & RX_DATA_LEN_UNDERFLOW_MSK) && (task->task_proto == SAS_PROTOCOL_SSP)) { /*print detail sense info when data underflow happened*/ - int rc; - struct hisi_sas_sense_data sense_data; + bool rc; + int sb_len; + u8 *sense_buffer; + struct scsi_sense_hdr sshdr; struct ssp_response_iu *iu = hisi_sas_status_buf_addr_mem(slot) + sizeof(struct hisi_sas_err_record); - rc = hisi_sas_get_sense_data(iu, &sense_data); - if (!rc) + sb_len = iu->sense_data_len; + sense_buffer = iu->sense_data; + rc = scsi_normalize_sense(sense_buffer, sb_len, &sshdr); + if (rc) dev_info(dev, "data underflow, rsp_code:0x%x, sensekey:0x%x, ASC:0x%x, ASCQ:0x%x.\n", - iu->resp_data[0], - sense_data.sense_key, - sense_data.add_sense_code, - sense_data.add_sense_code_qua); + sshdr.response_code, + sshdr.sense_key, + sshdr.asc, + sshdr.ascq); else - dev_info(dev, "data underflow without sense, rsp_code:0x%x, rc:%d.\n", - iu->resp_data[0], rc); + dev_info(dev, "data underflow without sense, rsp_code:0x%02x.\n", + iu->resp_data[0]); } if (unlikely(slot->abort)) { sas_task_abort(task); -- Gitee From 1bc9fed394dc8160191178e3250af71518ca073b Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Fri, 13 May 2022 08:34:38 +0000 Subject: [PATCH 333/340] scsi: hisi_sas: Change hisi_sas_control_phy() phyup timeout mainline inclusion from mainline-5.17-rc1 commit 512623de5239 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4UT5N CVE: NA ------------------------------------------- The time of phyup not only depends on the controller but also the type of disk connected. As an example, from experience, for some SATA disks the amount of time from reset/power-on to receive the D2H FIS for phyup can take upto and more than 10s sometimes. According to the specification of some SATA disks such as ST14000NM0018, the max time from power-on to ready is 30s. Based on this the current timeout of phyup at 2s which is not enough. So set the value as HISI_SAS_WAIT_PHYUP_TIMEOUT (30s) in hisi_sas_control_phy(). For v3 hw there is a pre-existing workaround for a HW bug, being that we issue a link reset when the OOB occurs but the phyup does not. The current phyup timeout is HISI_SAS_WAIT_PHYUP_TIMEOUT. So if this does occur from when issuing a phy enable or similar via hisi_sas_control_phy(), the subsequent HW workaround linkreset processing calls hisi_sas_control_phy(), but this will pend the original phy reset timing out, so it is safe. Link: https://lore.kernel.org/r/1645703489-87194-3-git-send-email-john.garry@huawei.com Signed-off-by: Xiang Chen Signed-off-by: John Garry Signed-off-by: Martin K. 
Petersen Signed-off-by: Xingui Yang conflict: drivers/scsi/hisi_sas/hisi_sas.h drivers/scsi/hisi_sas/hisi_sas_main.c Reviewed-by: Xingui Yang Reviewed-by: kang fenglong Signed-off-by: Yongqiang Liu --- drivers/scsi/hisi_sas/hisi_sas.h | 1 + drivers/scsi/hisi_sas/hisi_sas_main.c | 8 ++++++-- drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 3 +-- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 3 +-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index f3bb10da8f86..a791401be7b3 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -88,6 +88,7 @@ #define HISI_SAS_PROT_MASK (HISI_SAS_DIF_PROT_MASK | HISI_SAS_DIX_PROT_MASK) +#define HISI_SAS_WAIT_PHYUP_TIMEOUT (30 * HZ) #define CLEAR_ITCT_TIMEOUT 20 struct hisi_hba; diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 6ee5168a6bd8..93df1c6b94c7 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1783,6 +1783,7 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) int rc, reset_type = (sas_dev->dev_status == HISI_SAS_DEV_INIT || !dev_is_sata(device)) ? true : false; struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); + struct device *dev = hisi_hba->dev; struct sas_ha_struct *sas_ha = &hisi_hba->sha; DECLARE_COMPLETION_ONSTACK(phyreset); @@ -1809,7 +1810,8 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) struct hisi_sas_phy *phy = container_of(sas_phy, struct hisi_sas_phy, sas_phy); /* Wait for I_T reset complete, time out after 2s */ - int ret = wait_for_completion_timeout(&phyreset, 2 * HZ); + int ret = wait_for_completion_timeout(&phyreset, + HISI_SAS_WAIT_PHYUP_TIMEOUT); unsigned long flags; spin_lock_irqsave(&phy->lock, flags); @@ -1818,8 +1820,10 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) spin_unlock_irqrestore(&phy->lock, flags); /* report PHY down if timed out */ - if (!ret) + if (!ret) { + dev_warn(dev, "phy%d reset timeout\n", sas_phy->id); hisi_sas_phy_down(hisi_hba, sas_phy->id, 0); + } } else if (sas_dev->dev_status != HISI_SAS_DEV_INIT) /* Sleep 2s, wait for I_T reset at expander env except fail */ if (!rc) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 61ff065e5d9b..e6e489778014 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -2874,7 +2874,6 @@ static const struct hisi_sas_hw_error port_ecc_axi_error[] = { }, }; -#define WAIT_PHYUP_TIMEOUT_V2_HW 20 static void wait_phyup_timedout_v2_hw(struct timer_list *t) { struct hisi_sas_phy *phy = from_timer(phy, t, timer); @@ -2894,7 +2893,7 @@ static void phy_oob_ready_v2_hw(struct hisi_hba *hisi_hba, int phy_no) if (!timer_pending(&phy->timer)) { dev_dbg(dev, "phy%d OOB ready\n", phy_no); phy->timer.function = wait_phyup_timedout_v2_hw; - phy->timer.expires = jiffies + WAIT_PHYUP_TIMEOUT_V2_HW * HZ; + phy->timer.expires = jiffies + HISI_SAS_WAIT_PHYUP_TIMEOUT; add_timer(&phy->timer); } } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index a268a2ec686c..baae04adc8e4 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -1890,7 +1890,6 @@ static void handle_chl_int2_v3_hw(struct hisi_hba *hisi_hba, int phy_no) hisi_sas_phy_write32(hisi_hba, phy_no, CHL_INT2, irq_value); } -#define WAIT_PHYUP_TIMEOUT_V3_HW 20 static void 
wait_phyup_timedout_v3_hw(struct timer_list *t) { struct hisi_sas_phy *phy = from_timer(phy, t, timer); @@ -1917,7 +1916,7 @@ static void handle_chl_int0_v3_hw(struct hisi_hba *hisi_hba, int phy_no) if (!timer_pending(&phy->timer)) { phy->timer.function = wait_phyup_timedout_v3_hw; phy->timer.expires = jiffies + - WAIT_PHYUP_TIMEOUT_V3_HW * HZ; + HISI_SAS_WAIT_PHYUP_TIMEOUT; add_timer(&phy->timer); } } -- Gitee From f0993522412d567c96241f1dc7d9f2bd03d501f0 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 17 May 2022 07:38:49 +0000 Subject: [PATCH 334/340] mm/memory.c: update the first page in clear_gigantic_page_chunk hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I57SM0 CVE: NA -------------------------------- Patch "mm: parallelize clear_gigantic_page" make clear_gigantic_page to be parallelized. But forgot to update the first page which results the first page for each block is still the head page. Fix it by calculating the first page for each block. By the way, add a check to pointer p in order to prevent kernel panic. Fixes: ae0cd4d46ced ("mm: parallelize clear_gigantic_page") Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Yongqiang Liu --- mm/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 98c2872f8415..8be034c0f10b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4721,14 +4721,15 @@ static int clear_gigantic_page_chunk(unsigned long start, unsigned long end, struct cgp_args *args) { struct page *base_page = args->base_page; - struct page *p = base_page; + struct page *p = mem_map_offset(base_page, start); unsigned long addr = args->addr; unsigned long i; might_sleep(); for (i = start; i < end; i++, p = mem_map_next(p, base_page, i)) { cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); + if (p) + clear_user_highpage(p, addr + i * PAGE_SIZE); } return KTASK_RETURN_SUCCESS; -- Gitee From 1b42f1e2dc66c60fa5146652a7a67b2121046edf Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 17 May 2022 07:38:50 +0000 Subject: [PATCH 335/340] drm/vgem: Close use-after-free race in vgem_gem_create stable inclusion from linux-4.19.242 commit df2c1f38939aabb8c6beca108f08b90f050b9ebc category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I574Y3 CVE: CVE-2022-1419 -------------------------------- commit 4b848f20eda5974020f043ca14bacf7a7e634fc8 upstream. There's two references floating around here (for the object reference, not the handle_count reference, that's a different thing): - The temporary reference held by vgem_gem_create, acquired by creating the object and released by calling drm_gem_object_put_unlocked. - The reference held by the object handle, created by drm_gem_handle_create. This one generally outlives the function, except if a 2nd thread races with a GEM_CLOSE ioctl call. So usually everything is correct, except in that race case, where the access to gem_object->size could be looking at freed data already. Which again isn't a real problem (userspace shot its feet off already with the race, we could return garbage), but maybe someone can exploit this as an information leak. 
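The safe pattern the fix follows can be reduced to a small standalone sketch: copy whatever the caller needs while still holding the temporary reference, drop that reference, then use only the copy. The demo_obj type and demo_* functions below are invented for illustration and are not the DRM GEM API.

  #include <stdatomic.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct demo_obj {
          atomic_int refcnt;
          unsigned long long size;
  };

  static void demo_obj_put(struct demo_obj *o)
  {
          if (atomic_fetch_sub(&o->refcnt, 1) == 1)
                  free(o);
  }

  /*
   * Snapshot the size into the reply while the object is still pinned by
   * our reference, drop the reference, and from then on only touch the
   * copy.  A racing holder of the handle reference may free the object at
   * any point after our put.
   */
  static int demo_dumb_create(struct demo_obj *o, unsigned long long *out_size)
  {
          *out_size = o->size;            /* copy while still referenced */
          demo_obj_put(o);                /* drop the temporary reference */
          printf("Created object of size %llu\n", *out_size);
          return 0;
  }

  int main(void)
  {
          struct demo_obj *o = malloc(sizeof(*o));
          unsigned long long size;

          if (!o)
                  return 1;
          atomic_init(&o->refcnt, 1);     /* only the temporary reference */
          o->size = 4096;
          demo_dumb_create(o, &size);
          return 0;
  }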
Cc: Dan Carpenter Cc: Hillf Danton Reported-by: syzbot+0dc4444774d419e916c8@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Cc: Emil Velikov Cc: Daniel Vetter Cc: Sean Paul Cc: Chris Wilson Cc: Eric Anholt Cc: Sam Ravnborg Cc: Rob Clark Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200202132133.1891846-1-daniel.vetter@ffwll.ch [OP: backport to 4.19: adjusted DRM_DEBUG() -> DRM_DEBUG_DRIVER()] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu Reviewed-by: Zhang Xiaoxu Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/gpu/drm/vgem/vgem_drv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 4709f08f39e4..02f518837978 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -189,9 +189,10 @@ static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, return ERR_CAST(obj); ret = drm_gem_handle_create(file, &obj->base, handle); - drm_gem_object_put_unlocked(&obj->base); - if (ret) + if (ret) { + drm_gem_object_put_unlocked(&obj->base); return ERR_PTR(ret); + } return &obj->base; } @@ -214,7 +215,9 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev, args->size = gem_object->size; args->pitch = pitch; - DRM_DEBUG_DRIVER("Created object of size %lld\n", size); + drm_gem_object_put_unlocked(gem_object); + + DRM_DEBUG_DRIVER("Created object of size %llu\n", args->size); return 0; } -- Gitee From 151284c8e9385a46bb01d69fd7484a7e552bce52 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 17 May 2022 11:35:55 +0000 Subject: [PATCH 336/340] ptrace: Check PTRACE_O_SUSPEND_SECCOMP permission on PTRACE_SEIZE stable inclusion from linux-4.19.238 commit b1f438f872dcda10a79e6aeaf06fd52dfb15a6ab category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I57IYD CVE: CVE-2022-30594 -------------------------------- commit ee1fee900537b5d9560e9f937402de5ddc8412f3 upstream. Setting PTRACE_O_SUSPEND_SECCOMP is supposed to be a highly privileged operation because it allows the tracee to completely bypass all seccomp filters on kernels with CONFIG_CHECKPOINT_RESTORE=y. It is only supposed to be settable by a process with global CAP_SYS_ADMIN, and only if that process is not subject to any seccomp filters at all. However, while these permission checks were done on the PTRACE_SETOPTIONS path, they were missing on the PTRACE_SEIZE path, which also sets user-specified ptrace flags. Move the permissions checks out into a helper function and let both ptrace_attach() and ptrace_setoptions() call it. Cc: stable@kernel.org Fixes: 13c4a90119d2 ("seccomp: add ptrace options for suspend/resume") Signed-off-by: Jann Horn Link: https://lkml.kernel.org/r/20220319010838.1386861-1-jannh@google.com Signed-off-by: Eric W. 
Biederman Signed-off-by: Greg Kroah-Hartman Signed-off-by: Li Huafei Reviewed-by: Xu Kuohai Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f32095d7aa5e..424a04cdda1d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -365,6 +365,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } +static int check_ptrace_options(unsigned long data) +{ + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; + + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + !IS_ENABLED(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + return 0; +} + static int ptrace_attach(struct task_struct *task, long request, unsigned long addr, unsigned long flags) @@ -376,8 +396,16 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) { if (addr != 0) goto out; + /* + * This duplicates the check in check_ptrace_options() because + * ptrace_attach() and ptrace_setoptions() have historically + * used different error codes for unknown ptrace options. + */ if (flags & ~(unsigned long)PTRACE_O_MASK) goto out; + retval = check_ptrace_options(flags); + if (retval) + return retval; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); } else { flags = PT_PTRACED; @@ -650,22 +678,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { unsigned flags; + int ret; - if (data & ~(unsigned long)PTRACE_O_MASK) - return -EINVAL; - - if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { - if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || - !IS_ENABLED(CONFIG_SECCOMP)) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || - current->ptrace & PT_SUSPEND_SECCOMP) - return -EPERM; - } + ret = check_ptrace_options(data); + if (ret) + return ret; /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; -- Gitee From 7734aa05f292e4bb8b79d8617aa85db4fa4d0d1a Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 17 May 2022 11:35:56 +0000 Subject: [PATCH 337/340] ax25: Fix UAF bugs in ax25 timers stable inclusion from stable-v4.19.240 commit 3082f32c45465b692c314131c2a3657e0c23e09d bugzilla: 186594, https://gitee.com/src-openeuler/kernel/issues/I578X1 CVE: CVE-2022-1205 -------------------------------- commit 82e31755e55fbcea6a9dfaae5fe4860ade17cbc0 upstream. There are race conditions that may lead to UAF bugs in ax25_heartbeat_expiry(), ax25_t1timer_expiry(), ax25_t2timer_expiry(), ax25_t3timer_expiry() and ax25_idletimer_expiry(), when we call ax25_release() to deallocate ax25_dev. One of the UAF bugs caused by ax25_release() is shown below: (Thread 1) | (Thread 2) ax25_dev_device_up() //(1) | ... | ax25_kill_by_device() ax25_bind() //(2) | ax25_connect() | ... ax25_std_establish_data_link() | ax25_start_t1timer() | ax25_dev_device_down() //(3) mod_timer(&ax25->t1timer,..) | | ax25_release() (wait a time) | ... | ax25_dev_put(ax25_dev) //(4)FREE ax25_t1timer_expiry() | ax25->ax25_dev->values[..] //USE| ... ... 
| We increase the refcount of ax25_dev in position (1) and (2), and decrease the refcount of ax25_dev in position (3) and (4). The ax25_dev will be freed in position (4) and be used in ax25_t1timer_expiry(). The fail log is shown below: [ 106.116942] BUG: KASAN: use-after-free in ax25_t1timer_expiry+0x1c/0x60 [ 106.116942] Read of size 8 at addr ffff88800bda9028 by task swapper/0/0 [ 106.116942] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.17.0-06123-g0905eec574 [ 106.116942] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-14 [ 106.116942] Call Trace: ... [ 106.116942] ax25_t1timer_expiry+0x1c/0x60 [ 106.116942] call_timer_fn+0x122/0x3d0 [ 106.116942] __run_timers.part.0+0x3f6/0x520 [ 106.116942] run_timer_softirq+0x4f/0xb0 [ 106.116942] __do_softirq+0x1c2/0x651 ... This patch adds del_timer_sync() in ax25_release(), which could ensure that all timers stop before we deallocate ax25_dev. Signed-off-by: Duoming Zhou Signed-off-by: Paolo Abeni [OP: backport to 4.19: adjust context] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Signed-off-by: Lu Wei Reviewed-by: Yue Haibing Reviewed-by: Xiu Jianfeng Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- net/ax25/af_ax25.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1fab69daa968..4ecdde530265 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1053,6 +1053,11 @@ static int ax25_release(struct socket *sock) ax25_destroy_socket(ax25); } if (ax25_dev) { + del_timer_sync(&ax25->timer); + del_timer_sync(&ax25->t1timer); + del_timer_sync(&ax25->t2timer); + del_timer_sync(&ax25->t3timer); + del_timer_sync(&ax25->idletimer); dev_put(ax25_dev->dev); ax25_dev_put(ax25_dev); } -- Gitee From e10b517065fa5fc02bc791b10d830a19a84f271a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 17 May 2022 11:35:57 +0000 Subject: [PATCH 338/340] Revert "SUNRPC: attempt AF_LOCAL connect on setup" mainline inclusion from mainline-v5.18-rc6 commit a3d0562d4dc039bca39445e1cddde7951662e17d category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I57MRF CVE: NA ------------------------------------------- This reverts commit 7073ea8799a8cf73db60270986f14e4aae20fa80. We must not try to connect the socket while the transport is under construction, because the mechanisms to safely tear it down are not in place. As the code stands, we end up leaking the sockets on a connection error. 
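The underlying rule is the usual init-time unwinding discipline: a step that can fail may only run once every earlier allocation has a matching error label, otherwise a failure leaks whatever was set up before it. A minimal sketch with made-up demo_* names (not the SUNRPC transport code) follows.

  #include <stdio.h>
  #include <stdlib.h>

  struct demo_xprt {
          void *sock;
          void *buf;
  };

  static int demo_connect(struct demo_xprt *x)
  {
          (void)x;
          return -1;                      /* pretend the connect attempt fails */
  }

  static struct demo_xprt *demo_setup(void)
  {
          struct demo_xprt *x = calloc(1, sizeof(*x));

          if (!x)
                  return NULL;

          x->sock = malloc(64);
          if (!x->sock)
                  goto err_free_xprt;

          x->buf = malloc(64);
          if (!x->buf)
                  goto err_free_sock;

          /*
           * Connecting here, before the caller has any way to tear the
           * transport down, only works if this function unwinds everything
           * itself on failure; otherwise the socket is leaked.
           */
          if (demo_connect(x) < 0)
                  goto err_free_buf;

          return x;

  err_free_buf:
          free(x->buf);
  err_free_sock:
          free(x->sock);
  err_free_xprt:
          free(x);
          return NULL;
  }

  int main(void)
  {
          printf("setup %s\n", demo_setup() ? "succeeded" : "failed and was unwound");
          return 0;
  }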
Reported-by: wanghai (M) Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Wang Hai Reviewed-by: Zhang Xiaoxu Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- net/sunrpc/xprtsock.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f93d386ae923..38598edcd86e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2978,9 +2978,6 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) } xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); - ret = ERR_PTR(xs_local_setup_socket(transport)); - if (ret) - goto out_err; break; default: ret = ERR_PTR(-EAFNOSUPPORT); -- Gitee From 37c00cf3cbdafd97527feae0b8e52bf788435fef Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 17 May 2022 11:35:58 +0000 Subject: [PATCH 339/340] SUNRPC: Ensure that the gssproxy client can start in a connected state mainline inclusion from mainline-v5.18-rc7 commit fd13359f54ee854f00134abc6be32da94ec53dbf category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I57MRF CVE: NA ------------------------------------------- Ensure that the gssproxy client connects to the server from the gssproxy daemon process context so that the AF_LOCAL socket connection is done using the correct path and namespaces. Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust conflicts: include/linux/sunrpc/clnt.h net/sunrpc/clnt.c Signed-off-by: Wang Hai Reviewed-by: Zhang Xiaoxu Reviewed-by: Wei Yongjun Signed-off-by: Yongqiang Liu --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/auth_gss/gss_rpc_upcall.c | 1 + net/sunrpc/clnt.c | 37 ++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 10bcbea6e952..8aa865bce4f6 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -144,6 +144,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT (1UL << 8) #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) +#define RPC_CLNT_CREATE_CONNECTED (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 73dcda060335..60fb9529c6ad 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -111,6 +111,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9ac94c774335..dc58c227f37c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -79,6 +79,7 @@ static void call_connect_status(struct rpc_task *task); static __be32 *rpc_encode_header(struct rpc_task *task); static __be32 *rpc_verify_header(struct rpc_task *task); static int rpc_ping(struct rpc_clnt *clnt); +static int rpc_ping_noreply(struct rpc_clnt *clnt); static void rpc_register_client(struct rpc_clnt *clnt) { @@ -481,6 +482,12 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, rpc_shutdown_client(clnt); return ERR_PTR(err); } + } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) { + int err = rpc_ping_noreply(clnt); + if (err != 0) { + 
rpc_shutdown_client(clnt); + return ERR_PTR(err); + } } clnt->cl_softrtry = 1; @@ -2545,6 +2552,10 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; +static const struct rpc_procinfo rpcproc_null_noreply = { + .p_encode = rpcproc_encode_null, +}; + static int rpc_ping(struct rpc_clnt *clnt) { struct rpc_message msg = { @@ -2557,6 +2568,32 @@ static int rpc_ping(struct rpc_clnt *clnt) return err; } +static int rpc_ping_noreply(struct rpc_clnt *clnt) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_null_noreply, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = &rpc_default_ops, + .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + }; + struct rpc_task *task; + int status; + + msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + put_rpccred(msg.rpc_cred); + return PTR_ERR(task); + } + status = task->tk_status; + rpc_put_task(task); + put_rpccred(msg.rpc_cred); + return status; +} + static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, -- Gitee From 458d642311d9a39c8c68ab09e0fa60c528d78f7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20L=C3=B6hle?= Date: Tue, 17 May 2022 12:03:16 +0000 Subject: [PATCH 340/340] mmc: block: fix read single on recovery logic mainline inclusion from mainline-v5.17-rc5 commit 54309fde1a352ad2674ebba004a79f7d20b9f037 category: bugfix bugzilla: 186729, https://gitee.com/openeuler/kernel/issues/I578BN CVE: CVE-2022-20008 -------------------------------- On reads with MMC_READ_MULTIPLE_BLOCK that fail, the recovery handler will use MMC_READ_SINGLE_BLOCK for each of the blocks, up to MMC_READ_SINGLE_RETRIES times each. The logic for this is fixed to never report unsuccessful reads as success to the block layer. On command error with retries remaining, blk_update_request was called with whatever value error was set last to. In case it was last set to BLK_STS_OK (default), the read will be reported as success, even though there was no data read from the device. This could happen on a CRC mismatch for the response, a card rejecting the command (e.g. again due to a CRC mismatch). In case it was last set to BLK_STS_IOERR, the error is reported correctly, but no retries will be attempted. Fixes: 81196976ed946c ("mmc: block: Add blk-mq support") Cc: stable@vger.kernel.org Signed-off-by: Christian Loehle Reviewed-by: Adrian Hunter Link: https://lore.kernel.org/r/bc706a6ab08c4fe2834ba0c05a804672@hyperstone.com Signed-off-by: Ulf Hansson Conflict: commit 40c96853fef1 ("mmc: core: Enable re-use of mmc_blk_in_tran_state()") is not backported, mmc_ready_for_data() doesn't exist, use mmc_blk_in_tran_state() instead. 
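The shape of the fixed recovery loop, reduced to a standalone sketch (DEMO_READ_RETRIES and demo_read_block() are placeholders, not the mmc driver code): each block gets a bounded number of single-block attempts, and the request is only left marked OK if one of those attempts actually succeeded.

  #include <stdbool.h>
  #include <stdio.h>

  #define DEMO_READ_RETRIES 3             /* stands in for MMC_READ_SINGLE_RETRIES */

  /* Pretend single-block read that only works on its third attempt. */
  static bool demo_read_block(int blk)
  {
          static int attempts;

          (void)blk;
          return ++attempts >= 3;
  }

  int main(void)
  {
          int status = 0;                 /* 0 plays the role of BLK_STS_OK */
          bool ok = false;

          for (int retry = 0; retry <= DEMO_READ_RETRIES && !ok; retry++)
                  ok = demo_read_block(0);

          if (!ok)
                  status = -1;            /* never report an unread block as OK */

          printf("block read %s, status %d\n", ok ? "succeeded" : "failed", status);
          return 0;
  }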
Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Reviewed-by: Xiu Jianfeng Signed-off-by: Yongqiang Liu --- drivers/mmc/core/block.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 60eac66dc9f0..890a21bc3e28 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1658,31 +1658,31 @@ static void mmc_blk_read_single(struct mmc_queue *mq, struct request *req) struct mmc_card *card = mq->card; struct mmc_host *host = card->host; blk_status_t error = BLK_STS_OK; - int retries = 0; do { u32 status; int err; + int retries = 0; - mmc_blk_rw_rq_prep(mqrq, card, 1, mq); + while (retries++ <= MMC_READ_SINGLE_RETRIES) { + mmc_blk_rw_rq_prep(mqrq, card, 1, mq); - mmc_wait_for_req(host, mrq); + mmc_wait_for_req(host, mrq); - err = mmc_send_status(card, &status); - if (err) - goto error_exit; - - if (!mmc_host_is_spi(host) && - !mmc_blk_in_tran_state(status)) { - err = mmc_blk_fix_state(card, req); + err = mmc_send_status(card, &status); if (err) goto error_exit; - } - if (mrq->cmd->error && retries++ < MMC_READ_SINGLE_RETRIES) - continue; + if (!mmc_host_is_spi(host) && + !mmc_blk_in_tran_state(status)) { + err = mmc_blk_fix_state(card, req); + if (err) + goto error_exit; + } - retries = 0; + if (!mrq->cmd->error) + break; + } if (mrq->cmd->error || mrq->data->error || -- Gitee