内核:5.9.0
流程图
generic_file_buffered_read一种调用路径(通过sendfile系统调用读取文件触发,见下方#12/#13栈帧;普通的read(2)路径最终也会走到同一个函数):
#0 ondemand_readahead (mapping=0xffff888005c61340, ra=0xffff888005927598, filp=0xffff888005927500, hit_readahead_marker=false, index=0, req_size=16) at mm/readahead.c:445
#1 0xffffffff812eeea1 in page_cache_sync_readahead (req_count=<optimized out>, index=<optimized out>, filp=<optimized out>, ra=<optimized out>, mapping=<optimized out>) at mm/readahead.c:585
#2 page_cache_sync_readahead (mapping=<optimized out>, ra=0xffff888005927598, filp=0xffff888005927500, index=<optimized out>, req_count=16) at mm/readahead.c:567
#3 0xffffffff812dcae7 in generic_file_buffered_read (iocb=0xffffc90000033cc0, iter=<optimized out>, written=0) at mm/filemap.c:2199
#4 0xffffffff812dd8ed in generic_file_read_iter (iocb=0xffffc90000033cc0, iter=0xffffc90000033c98) at mm/filemap.c:2507
#5 0xffffffff814c7fc9 in ext4_file_read_iter (to=<optimized out>, iocb=<optimized out>) at fs/ext4/file.c:131
#6 ext4_file_read_iter (iocb=0xffffc90000033cc0, to=0xffffc90000033c98) at fs/ext4/file.c:114
#7 0xffffffff81405c0f in call_read_iter (file=<optimized out>, iter=<optimized out>, kio=<optimized out>) at ./include/linux/fs.h:1876
#8 generic_file_splice_read (in=0xffff888005927500, ppos=0xffffc90000033da8, pipe=<optimized out>, len=<optimized out>, flags=<optimized out>) at fs/splice.c:312
#9 0xffffffff81407b51 in do_splice_to (in=0xffff888005927500, ppos=0xffffc90000033da8, pipe=0xffff8880058fb6c0, len=65536, flags=<optimized out>) at fs/splice.c:890
#10 0xffffffff81407cab in splice_direct_to_actor (in=<optimized out>, sd=0xffffc90000033e00, actor=<optimized out>) at fs/splice.c:970
#11 0xffffffff81408012 in do_splice_direct (in=<optimized out>, ppos=0xffffc90000033ea8, out=0xffff888005927400, opos=0xffffc90000033eb0, len=16777216, flags=<optimized out>) at fs/splice.c:1079
#12 0xffffffff813ae9b1 in do_sendfile (out_fd=<optimized out>, in_fd=<optimized out>, ppos=0x0 <fixed_percpu_data>, count=<optimized out>, max=<optimized out>) at fs/read_write.c:1548
#13 0xffffffff813af30b in __do_sys_sendfile64 (count=<optimized out>, offset=<optimized out>, in_fd=<optimized out>, out_fd=<optimized out>) at fs/read_write.c:1609
#14 __se_sys_sendfile64 (count=<optimized out>, offset=<optimized out>, in_fd=<optimized out>, out_fd=<optimized out>) at fs/read_write.c:1595
generic_file_buffered_read核心逻辑:
- 尝试从page cache(address_space数据结构)中查找,如果命中返回。
- 如果未在page cache中,readpage从磁盘读取数据到page cache中。
- 按需预读(readahead,并非所有场景都会触发预读),提升IO性能。
/**
 * generic_file_buffered_read - generic file read routine
 * @iocb: the iocb to read
 * @iter: data destination
 * @written: already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * Note: the kernel's own comment admits the implementation is "ugly";
 * the gotos exist to keep the error-handling paths readable.
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Return:
 * * total number of bytes copied, including those that were already @written
 * * negative error code if nothing was copied
 */
ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	/* Per-file readahead state; the readahead algorithm itself is
	 * analysed in a separate article. */
	struct file_ra_state *ra = &filp->f_ra;
	/* Current read position: byte offset within the file. */
	loff_t *ppos = &iocb->ki_pos;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;	/* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;
	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
	/*
	 * The file is split into PAGE_SIZE (4K) chunks; index is the
	 * page-cache page index of the current position:
	 * |----index 0----|----index 1----|----index 2----|
	 *        4K              4K              4K
	 */
	index = *ppos >> PAGE_SHIFT;
	/* Page index / in-page offset of the previous read, cached in ra. */
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
	/* iter->count is the number of bytes to read; last_index is one
	 * past the last page index this read will touch. */
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;
	/* Loop until the iter->count bytes requested by the caller have
	 * been copied (or an error / EOF stops us). */
	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;
		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}
		/* Look up the page cache (the address_space) for a page
		 * already holding data at this index. Hit => use it. */
		page = find_get_page(mapping, index);
		if (!page) {
			/* Caller forbade starting any I/O (data must already
			 * be cached) - bail out with -EAGAIN. */
			if (iocb->ki_flags & IOCB_NOIO)
				goto would_block;
			/*
			 * "Synchronous" readahead. Despite the name this does
			 * not wait for the I/O: it merely submits requests to
			 * the block layer and returns.
			 */
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			/* sync_readahead allocates pages without entering the
			 * slow allocation path, so it may fail; in that case
			 * go to no_cached_page, which allocates a page itself
			 * and issues ->readpage() for the disk I/O. */
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		/* Reaching here with PG_readahead set means we hit the
		 * readahead marker page: the page we want was prefetched.
		 * Understanding this requires the readahead machinery and
		 * how the PG_readahead flag is set on pages. */
		if (PageReadahead(page)) {
			if (iocb->ki_flags & IOCB_NOIO) {
				put_page(page);
				goto out;
			}
			/* Trigger another, asynchronous readahead: since the
			 * previous readahead window was hit, prefetch further
			 * ahead and update the ra state. */
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		/* Page contents not yet valid - e.g. the system is I/O bound
		 * and the readahead above has not completed - so we must
		 * wait here. */
		if (!PageUptodate(page)) {
			/*
			 * See comment in do_read_cache_page on why
			 * wait_on_page_locked is used to avoid unnecessarily
			 * serialisations and why it's safe.
			 */
			/* IOCB_WAITQ indicates async I/O, e.g. io_uring. */
			if (iocb->ki_flags & IOCB_WAITQ) {
				if (written) {
					put_page(page);
					goto out;
				}
				error = wait_on_page_locked_async(page,
								iocb->ki_waitq);
			} else {
				/* Caller asked not to wait: return -EAGAIN. */
				if (iocb->ki_flags & IOCB_NOWAIT) {
					put_page(page);
					goto would_block;
				}
				/* Block until the I/O completes; a process
				 * shown as "blocked on I/O" in systrace is
				 * sleeping right here. */
				error = wait_on_page_locked_killable(page);
			}
			if (unlikely(error))
				goto readpage_error;
			/* The wait above succeeded and the page is valid. */
			if (PageUptodate(page))
				goto page_ok;
			if (inode->i_blkbits == PAGE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			/* pipes can't handle partially uptodate pages */
			if (unlikely(iov_iter_is_pipe(iter)))
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}
		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}
		nr = nr - offset;
		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);
		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		/* Copy the data to user space; iter->count is decremented by
		 * the amount copied. */
		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		prev_offset = offset;
		put_page(page);
		written += ret;
		/* iter->count == 0 means the whole request is satisfied:
		 * leave the loop. */
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			error = -EFAULT;
			goto out;
		}
		continue;
page_not_up_to_date:
		/* Get exclusive access to the page ... */
		if (iocb->ki_flags & IOCB_WAITQ)
			error = lock_page_async(page, iocb->ki_waitq);
		else
			error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;
page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			put_page(page);
			continue;
		}
		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}
readpage:
		/* Low-level page read: call ->readpage() to start the actual
		 * disk I/O. */
		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
			unlock_page(page);
			put_page(page);
			goto would_block;
		}
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);
		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				put_page(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}
		if (!PageUptodate(page)) {
			if (iocb->ki_flags & IOCB_WAITQ)
				error = lock_page_async(page, iocb->ki_waitq);
			else
				error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					put_page(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}
		goto page_ok;
readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		put_page(page);
		goto out;
no_cached_page:
		/* No page in the address_space cache: allocate one and
		 * insert it into the page cache (LRU). */
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			put_page(page);
			if (error == -EEXIST) {
				error = 0;
				goto find_page;
			}
			goto out;
		}
		goto readpage;
	}
would_block:
	error = -EAGAIN;
out:
	/* Save the final position into the readahead state and advance
	 * the file position for the caller. */
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;
	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	file_accessed(filp);
	return written ? written : error;
}
重点提示:代码中为什么先触发一次sync_readahead,再触发一次async_readahead?
- 正如代码注释所说,这两个函数的命名很容易引起误解。其实从应用调用的角度来看,这两个都是"异步"的,因为它们都只是向block layer提交(submit)io就返回,所以是异步。从代码上下文也能推断出来:比如sync_readahead调用完之后,紧接着又调用了find_get_page,并判断了page == NULL的情况,从这一点就能充分看出sync_readahead也是异步调用。
- 如果系统io情况正常,100%顺序读取的情况下,sync_readahead调用一次,触发了预读,后面再顺序读就不会触发sync_readahead了,而是触发async_readahead。async_readahead触发的条件是要读取的page满足PageReadahead(page),即预读页面被命中了。就像网络协议栈中的滑动窗口,既然请求已经来到预读的窗口中,那么就要重新触发新的预读,预读更多的页面。
具体上面这段话不理解的话,参考我后面readahead算法介绍的文章。
wait_on_page_locked_killable返回时机
IO完成之前会一直等待。IO完成后,块设备通过中断通知cpu,在中断处理函数中会进一步触发BLOCK_SOFTIRQ。在软中断处理例程中,最终会回调bio->bi_end_io(对ext4来说是mpage_end_io),解锁(unlock_page)之前被锁定的页面,同时调用SetPageUptodate设置Uptodate状态,调用栈如下:
#0 SetPageUptodate (page=<optimized out>) at ./include/linux/page-flags.h:542
#1 __read_end_io (bio=0xffff88800607be40) at fs/ext4/readpage.c:85
#2 0xffffffff8150ed00 in mpage_end_io (bio=0xffff88800607be40) at fs/ext4/readpage.c:183
#3 0xffffffff8168859f in bio_endio (bio=0xffff88800607be40) at block/bio.c:1449
#4 0xffffffff8168f9d7 in req_bio_endio (error=<optimized out>, nbytes=<optimized out>, bio=<optimized out>, rq=<optimized out>) at block/blk-core.c:259
#5 blk_update_request (req=0xffff8880062fe040, error=0 '\000', nr_bytes=131072) at block/blk-core.c:1577
#6 0xffffffff816a2d6a in blk_mq_end_request (rq=0xffff8880062fe040, error=<optimized out>) at ./include/linux/blkdev.h:976
#7 0xffffffff81b780c9 in virtblk_request_done (req=0xffff8880062fe040) at drivers/block/virtio_blk.c:171
#8 0xffffffff8169e6fb in blk_done_softirq (h=<optimized out>) at block/blk-mq.c:586
#9 0xffffffff826000d1 in __do_softirq () at kernel/softirq.c:298
#10 0xffffffff82400f82 in asm_call_on_stack () at arch/x86/entry/entry_64.S:708
#11 0xffffffff810ea498 in __run_on_irqstack (func=<optimized out>) at ./arch/x86/include/asm/irq_stack.h:26
wait_on_page_locked_killable函数代码
/*
 * wait_on_page_locked_killable - sleep (killable) until PG_locked clears.
 *
 * Returns 0 immediately if the page is already unlocked, otherwise
 * waits on the PG_locked bit of the head page; -EINTR on fatal signal.
 */
static inline int wait_on_page_locked_killable(struct page *page)
{
	if (PageLocked(page))
		return wait_on_page_bit_killable(compound_head(page), PG_locked);
	return 0;
}
/* Convenience macros for the sake of set_current_state: */
/* Uninterruptible sleep that can still be ended by a fatal signal. */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
/*
 * Wait for @bit_nr in page->flags to clear, sleeping in TASK_KILLABLE
 * state on the page's hashed wait queue.
 */
int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}
/*
 * Core wait loop: put the current task on the page's wait queue and
 * sleep until @bit_nr clears in page->flags (or, with @lock, until we
 * manage to take the bit ourselves). Returns 0 on success, -EINTR if a
 * signal permitted by @state arrives first.
 */
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
	struct page *page, int bit_nr, int state, bool lock)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	int ret = 0;
	init_wait(wait);
	/* Exclusive wakeup only when we intend to acquire the bit (lock). */
	wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
	wait->func = wake_page_function;
	wait_page.page = page;
	wait_page.bit_nr = bit_nr;
	for (;;) {
		/* Queue ourselves (once) and flag the page as having waiters
		 * so unlock_page() knows to call wake_up_page_bit(). */
		spin_lock_irq(&q->lock);
		if (likely(list_empty(&wait->entry))) {
			__add_wait_queue_entry_tail(q, wait);
			SetPageWaiters(page);
		}
		set_current_state(state);
		spin_unlock_irq(&q->lock);
		/* Bit still set => I/O not finished: yield the CPU. The sleep
		 * is accounted as iowait time. */
		if (likely(test_bit(bit_nr, &page->flags))) {
			io_schedule();
		}
		if (lock) {
			/* Lock mode: try to take the bit for ourselves. */
			if (!test_and_set_bit_lock(bit_nr, &page->flags))
				break;
		} else {
			/* Wait mode: done as soon as the bit is clear. */
			if (!test_bit(bit_nr, &page->flags))
				break;
		}
		if (unlikely(signal_pending_state(state, current))) {
			ret = -EINTR;
			break;
		}
	}
	finish_wait(q, wait);
	/*
	 * A signal could leave PageWaiters set. Clearing it here if
	 * !waitqueue_active would be possible (by open-coding finish_wait),
	 * but still fail to catch it in the case of wait hash collision. We
	 * already can fail to clear wait hash collision cases, so don't
	 * bother with signals either.
	 */
	return ret;
}
IO没有完成前,PG_locked一直置位,调用进入io_schedule,进程主动让出CPU,状态切入TASK_KILLABLE(同时也是一种TASK_UNINTERRUPTIBLE)。io_schedule会设置current->in_iowait = 1,io等待期间,cpu idle时检测到in_iowait计数就会累计iowait的时间。
IO完成解锁页面,唤醒进程
/*
 * ext4 read-completion helper, run from mpage_end_io() when the block
 * layer finishes the bio: for every page in the bio, mark the contents
 * valid (or invalid on error) and unlock the page, which wakes any
 * reader sleeping in wait_on_page_locked_*().
 */
static void __read_end_io(struct bio *bio)
{
	struct page *page;
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;
	bio_for_each_segment_all(bv, bio, iter_all) {
		page = bv->bv_page;
		/* PG_error was set if any post_read step failed */
		if (bio->bi_status || PageError(page)) {
			ClearPageUptodate(page);
			/* will re-read again later */
			ClearPageError(page);
		} else {
			/* I/O succeeded: set the Uptodate state. */
			SetPageUptodate(page);
		}
		/* Unlock the page; this wakes up threads waiting on it. */
		unlock_page(page);
	}
	if (bio->bi_private)
		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
	bio_put(bio);
}
/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
	BUILD_BUG_ON(PG_waiters != 7);
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	/* Atomically clear PG_locked and, if PG_waiters was set in the same
	 * byte, wake the tasks queued on this page's PG_locked bit. */
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
mpage_end_io 回调是哪里赋值的 - ext4读文件为例
fs/ext4/readpage.c : ext4_mpage_readpages
/*
 * Excerpt (elided lines shown as "...") from fs/ext4/readpage.c: builds
 * read bios for the given pages and shows where the completion callback
 * bi_end_io = mpage_end_io is installed before the bio is submitted.
 */
int ext4_mpage_readpages(struct address_space *mapping,
	struct list_head *pages, struct page *page,
	unsigned nr_pages)
{
	...
	for (; nr_pages; nr_pages--) {
		...
		bio_set_dev(bio, bdev);
		/* Starting sector: file system block converted to 512-byte sectors. */
		bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
		/* Completion callback, invoked from the block softirq path. */
		bio->bi_end_io = mpage_end_io;
		bio->bi_private = ctx;
		ext4_set_bio_ctx(inode, bio);
		bio_set_op_attrs(bio, REQ_OP_READ, 0);
		...
	}
	BUG_ON(pages && !list_empty(pages));
	if (bio)
		ext4_submit_bio_read(bio);
	return 0;
}
参考文章:
Linux 通用块层 bio 详解 – 字节岛技术分享
Life of an ext4 write request - Ext4
深入分析Linux内核File cache机制(上篇) - 知乎
Linux内核中跟踪文件PageCache预读_读取