How does the page cache get written back to the storage device?
The writeback threads turn dirty page cache pages into bios step by step; the bios are then assembled into requests, and the requests are finally linked onto the request queue for the block layer to consume. do_writepages goes through a_ops->writepages (or, when a filesystem does not provide one, falls back to calling its ->writepage for each dirty page) to flush the page cache to disk; each filesystem supplies these hooks in its struct address_space_operations. In the ufs filesystem the structure is implemented like this:
const struct address_space_operations ufs_aops = {
    .readpage    = ufs_readpage,
    .writepage   = ufs_writepage,
    .write_begin = ufs_write_begin,
    .write_end   = generic_write_end,
    .bmap        = ufs_bmap
};
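For context, do_writepages dispatches roughly as follows (a simplified paraphrase of the 3.x-era logic, not a verbatim copy). Since ufs_aops provides no .writepages, the generic path is taken and ->writepage, i.e. ufs_writepage, is called for every dirty page:

/* Simplified view of do_writepages() dispatch (3.x-era logic, paraphrased). */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
    if (wbc->nr_to_write <= 0)
        return 0;
    if (mapping->a_ops->writepages)
        return mapping->a_ops->writepages(mapping, wbc);
    /* No ->writepages (the ufs case): walk dirty pages and call ->writepage */
    return generic_writepages(mapping, wbc);
}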
Tracing through ufs_writepage step by step shows how a page cache page is turned into a bio:
ufs_writepage
  -> block_write_full_page
    -> block_write_full_page_endio
      -> __block_write_full_page
        -> submit_bh
          -> _submit_bh
            -> submit_bio
Here the page is first mapped to buffer heads; submit_bh then wraps each buffer head in a bio and hands it to the block layer via submit_bio.
static int ufs_writepage(struct page *page, struct writeback_control *wbc)
{
    return block_write_full_page(page, ufs_getfrag_block, wbc);
}

int block_write_full_page(struct page *page, get_block_t *get_block,
            struct writeback_control *wbc)
{
    return block_write_full_page_endio(page, get_block, wbc,
                       end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);

int block_write_full_page_endio(struct page *page, get_block_t *get_block,
            struct writeback_control *wbc, bh_end_io_t *handler)
{
    struct inode * const inode = page->mapping->host;
    loff_t i_size = i_size_read(inode);
    const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
    unsigned offset;

    /* Is the page fully inside i_size? */
    if (page->index < end_index)
        return __block_write_full_page(inode, page, get_block, wbc,
                           handler);

    /* Is the page fully outside i_size? (truncate in progress) */
    offset = i_size & (PAGE_CACHE_SIZE-1);
    if (page->index >= end_index+1 || !offset) {
        /*
         * The page may have dirty, unmapped buffers. For example,
         * they may have been added in ext3_writepage(). Make them
         * freeable here, so the page does not leak.
         */
        do_invalidatepage(page, 0);
        unlock_page(page);
        return 0; /* don't care */
    }

    /*
     * The page straddles i_size. It must be zeroed out on each and every
     * writepage invocation because it may be mmapped. "A file is mapped
     * in multiples of the page size. For a file that is not a multiple of
     * the page size, the remaining memory is zeroed when mapped, and
     * writes to that region are not written out to the file."
     */
    zero_user_segment(page, offset, PAGE_CACHE_SIZE);
    return __block_write_full_page(inode, page, get_block, wbc, handler);
}
EXPORT_SYMBOL(block_write_full_page_endio);

static int __block_write_full_page(struct inode *inode, struct page *page,
            get_block_t *get_block, struct writeback_control *wbc,
            bh_end_io_t *handler)
{
    int err;
    sector_t block;
    sector_t last_block;
    struct buffer_head *bh, *head;
    unsigned int blocksize, bbits;
    int nr_underway = 0;
    int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
            WRITE_SYNC : WRITE);

    head = create_page_buffers(page, inode,
                    (1 << BH_Dirty)|(1 << BH_Uptodate));

    /*
     * Be very careful. We have no exclusion from __set_page_dirty_buffers
     * here, and the (potentially unmapped) buffers may become dirty at
     * any time. If a buffer becomes dirty here after we've inspected it
     * then we just miss that fact, and the page stays dirty.
     *
     * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
     * handle that here by just cleaning them.
     */

    bh = head;
    blocksize = bh->b_size;
    bbits = block_size_bits(blocksize);

    block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
    last_block = (i_size_read(inode) - 1) >> bbits;

    /*
     * Get all the dirty buffers mapped to disk addresses and
     * handle any aliases from the underlying blockdev's mapping.
     */
    do {
        if (block > last_block) {
            /*
             * mapped buffers outside i_size will occur, because
             * this page can be outside i_size when there is a
             * truncate in progress.
             */
            /*
             * The buffer was zeroed by block_write_full_page()
             */
            clear_buffer_dirty(bh);
            set_buffer_uptodate(bh);
        } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
               buffer_dirty(bh)) {
            WARN_ON(bh->b_size != blocksize);
            err = get_block(inode, block, bh, 1);
            if (err)
                goto recover;
            clear_buffer_delay(bh);
            if (buffer_new(bh)) {
                /* blockdev mappings never come here */
                clear_buffer_new(bh);
                unmap_underlying_metadata(bh->b_bdev,
                            bh->b_blocknr);
            }
        }
        bh = bh->b_this_page;
        block++;
    } while (bh != head);

    do {
        if (!buffer_mapped(bh))
            continue;
        /*
         * If it's a fully non-blocking write attempt and we cannot
         * lock the buffer then redirty the page. Note that this can
         * potentially cause a busy-wait loop from writeback threads
         * and kswapd activity, but those code paths have their own
         * higher-level throttling.
         */
        if (wbc->sync_mode != WB_SYNC_NONE) {
            lock_buffer(bh);
        } else if (!trylock_buffer(bh)) {
            redirty_page_for_writepage(wbc, page);
            continue;
        }
        if (test_clear_buffer_dirty(bh)) {
            mark_buffer_async_write_endio(bh, handler);
        } else {
            unlock_buffer(bh);
        }
    } while ((bh = bh->b_this_page) != head);

    /*
     * The page and its buffers are protected by PageWriteback(), so we can
     * drop the bh refcounts early.
     */
    BUG_ON(PageWriteback(page));
    set_page_writeback(page);

    do {
        struct buffer_head *next = bh->b_this_page;
        if (buffer_async_write(bh)) {
            submit_bh(write_op, bh);
            nr_underway++;
        }
        bh = next;
    } while (bh != head);
    unlock_page(page);

    err = 0;
done:
    if (nr_underway == 0) {
        /*
         * The page was marked dirty, but the buffers were
         * clean. Someone wrote them back by hand with
         * ll_rw_block/submit_bh. A rare case.
         */
        end_page_writeback(page);

        /*
         * The page and buffer_heads can be released at any time from
         * here on.
         */
    }
    return err;

recover:
    /*
     * ENOSPC, or some other error. We may already have added some
     * blocks to the file, so we need to write these out to avoid
     * exposing stale data.
     * The page is currently locked and not marked for writeback
     */
    bh = head;
    /* Recovery: lock and submit the mapped buffers */
    do {
        if (buffer_mapped(bh) && buffer_dirty(bh) &&
            !buffer_delay(bh)) {
            lock_buffer(bh);
            mark_buffer_async_write_endio(bh, handler);
        } else {
            /*
             * The buffer may have been set dirty during
             * attachment to a dirty page.
             */
            clear_buffer_dirty(bh);
        }
    } while ((bh = bh->b_this_page) != head);
    SetPageError(page);
    BUG_ON(PageWriteback(page));
    mapping_set_error(page->mapping, err);
    set_page_writeback(page);
    do {
        struct buffer_head *next = bh->b_this_page;
        if (buffer_async_write(bh)) {
            clear_buffer_dirty(bh);
            submit_bh(write_op, bh);
            nr_underway++;
        }
        bh = next;
    } while (bh != head);
    unlock_page(page);
    goto done;
}

int submit_bh(int rw, struct buffer_head *bh)
{
    return _submit_bh(rw, bh, 0);
}
EXPORT_SYMBOL(submit_bh);

int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
{
    struct bio *bio;
    int ret = 0;

    BUG_ON(!buffer_locked(bh));
    BUG_ON(!buffer_mapped(bh));
    BUG_ON(!bh->b_end_io);
    BUG_ON(buffer_delay(bh));
    BUG_ON(buffer_unwritten(bh));

    /*
     * Only clear out a write error when rewriting
     */
    if (test_set_buffer_req(bh) && (rw & WRITE))
        clear_buffer_write_io_error(bh);

    /*
     * from here on down, it's all bio -- do the initial mapping,
     * submit_bio -> generic_make_request may further map this bio around
     */
    bio = bio_alloc(GFP_NOIO, 1);

    bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
    bio->bi_bdev = bh->b_bdev;
    bio->bi_io_vec[0].bv_page = bh->b_page;
    bio->bi_io_vec[0].bv_len = bh->b_size;
    bio->bi_io_vec[0].bv_offset = bh_offset(bh);

    bio->bi_vcnt = 1;
    bio->bi_size = bh->b_size;

    bio->bi_end_io = end_bio_bh_io_sync;
    bio->bi_private = bh;
    bio->bi_flags |= bio_flags;

    /* Take care of bh's that straddle the end of the device */
    guard_bh_eod(rw, bio, bh);

    if (buffer_meta(bh))
        rw |= REQ_META;
    if (buffer_prio(bh))
        rw |= REQ_PRIO;

    bio_get(bio);
    submit_bio(rw, bio);

    if (bio_flagged(bio, BIO_EOPNOTSUPP))
        ret = -EOPNOTSUPP;

    bio_put(bio);
    return ret;
}
EXPORT_SYMBOL_GPL(_submit_bh);
The bio is ultimately submitted via submit_bio, and generic_make_request then passes it down so it can be merged or assembled into a request:
void submit_bio(int rw, struct bio *bio)
{
    bio->bi_rw |= rw;

    /*
     * If it's a regular read/write or a barrier with data attached,
     * go through the normal accounting stuff before submission.
     */
    if (bio_has_data(bio)) {
        unsigned int count;

        if (unlikely(rw & REQ_WRITE_SAME))
            count = bdev_logical_block_size(bio->bi_bdev) >> 9;
        else
            count = bio_sectors(bio);

        if (rw & WRITE) {
            count_vm_events(PGPGOUT, count);
        } else {
            task_io_account_read(bio->bi_size);
            count_vm_events(PGPGIN, count);
        }

        if (unlikely(block_dump)) {
            char b[BDEVNAME_SIZE];
            printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                current->comm, task_pid_nr(current),
                (rw & WRITE) ? "WRITE" : "READ",
                (unsigned long long)bio->bi_sector,
                bdevname(bio->bi_bdev, b),
                count);
        }
    }

    generic_make_request(bio);
}
EXPORT_SYMBOL(submit_bio);

void generic_make_request(struct bio *bio)
{
    struct bio_list bio_list_on_stack;

    if (!generic_make_request_checks(bio))
        return;

    /*
     * We only want one ->make_request_fn to be active at a time, else
     * stack usage with stacked devices could be a problem. So use
     * current->bio_list to keep a list of requests submited by a
     * make_request_fn function. current->bio_list is also used as a
     * flag to say if generic_make_request is currently active in this
     * task or not. If it is NULL, then no make_request is active. If
     * it is non-NULL, then a make_request is active, and new requests
     * should be added at the tail
     */
    if (current->bio_list) {
        bio_list_add(current->bio_list, bio);
        return;
    }

    /* following loop may be a bit non-obvious, and so deserves some
     * explanation.
     * Before entering the loop, bio->bi_next is NULL (as all callers
     * ensure that) so we have a list with a single bio.
     * We pretend that we have just taken it off a longer list, so
     * we assign bio_list to a pointer to the bio_list_on_stack,
     * thus initialising the bio_list of new bios to be
     * added. ->make_request() may indeed add some more bios
     * through a recursive call to generic_make_request. If it
     * did, we find a non-NULL value in bio_list and re-enter the loop
     * from the top. In this case we really did just take the bio
     * of the top of the list (no pretending) and so remove it from
     * bio_list, and call into ->make_request() again.
     */
    BUG_ON(bio->bi_next);
    bio_list_init(&bio_list_on_stack);
    current->bio_list = &bio_list_on_stack;
    do {
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);

        q->make_request_fn(q, bio);

        bio = bio_list_pop(current->bio_list);
    } while (bio);
    current->bio_list = NULL; /* deactivate */
}
EXPORT_SYMBOL(generic_make_request);
The make_request_fn called inside generic_make_request is the callback registered when the queue is set up with blk_init_queue; by default it is blk_queue_bio. blk_init_queue initializes both callbacks, request_fn and make_request_fn:
blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
  -> blk_init_queue_node
    -> blk_init_allocated_queue
      -> q->request_fn = rfn;   // in the mmc subsystem, mmc_request_fn is passed as the first
                                // argument of blk_init_queue and ends up in q->request_fn
      -> blk_queue_make_request(q, blk_queue_bio);
        -> q->make_request_fn = mfn;   // initialized to blk_queue_bio by default
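To make the registration concrete, here is a minimal, hypothetical driver-side sketch (the mydev_* names are illustrative, not from the kernel source): the driver passes its request_fn to blk_init_queue, while make_request_fn is left at the default blk_queue_bio.

#include <linux/blkdev.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(mydev_lock);

/* Called by __blk_run_queue with q->queue_lock held, once blk_queue_bio has queued requests. */
static void mydev_request_fn(struct request_queue *q)
{
    struct request *req;

    /* Drain the requests that blk_queue_bio inserted via the elevator. */
    while ((req = blk_fetch_request(q)) != NULL) {
        /* ... program the hardware with req here ... */
        __blk_end_request_all(req, 0);  /* complete the request */
    }
}

static struct request_queue *mydev_init_queue(void)
{
    /*
     * q->request_fn      = mydev_request_fn;
     * q->make_request_fn stays at the default blk_queue_bio,
     * set internally through blk_queue_make_request().
     */
    return blk_init_queue(mydev_request_fn, &mydev_lock);
}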
blk_queue_bio first calls add_acct_request, which inserts the request into the request_queue through the I/O elevator (scheduler); there are three elevators to choose from: cfq, deadline and noop. It then calls __blk_run_queue, which invokes the request-handling callback q->request_fn registered earlier by the block device driver, so the driver can process the request:
blk_queue_bio
  -> add_acct_request
    -> __elv_add_request
      -> q->elevator->type->ops.elevator_add_req_fn(q, rq);
  -> __blk_run_queue
    -> __blk_run_queue_uncond
      -> q->request_fn
For example, in the mmc subsystem this request_fn is initialized to mmc_request_fn. It first wakes up the queue's worker thread; the thread then uses blk_fetch_request to take an unhandled request off the queue, and finally, through the request member of the host controller's ops, the data that was cached in pages - already converted into bios and organized into requests - is written back to the mmc device:
mmc_request_fn
  -> wake_up_process(mq->thread);
mmc_queue_thread
  -> req = blk_fetch_request(q);
  -> mq->issue_fn(mq, req);
    -> mmc_blk_issue_rq
      -> mmc_start_req
        -> host->ops->request(host, mrq);   // each platform's concrete implementation of request handling
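The split between mmc_request_fn (producer side, just a wake-up) and mmc_queue_thread (consumer side, fetch and issue) can be summarized with the following sketch; this is not the verbatim mmc code, and the my_queue/my_* names are purely illustrative:

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/sched.h>

struct my_queue {
    struct request_queue *queue;
    struct task_struct   *thread;
    int                  (*issue_fn)(struct my_queue *, struct request *);
};

/* request_fn: runs with the queue lock held and only kicks the worker thread. */
static void my_request_fn(struct request_queue *q)
{
    struct my_queue *mq = q->queuedata;

    if (mq)
        wake_up_process(mq->thread);    /* like mmc_request_fn */
}

/* Worker thread: fetches requests and hands them to the host controller driver. */
static int my_queue_thread(void *data)
{
    struct my_queue *mq = data;
    struct request_queue *q = mq->queue;

    while (!kthread_should_stop()) {
        struct request *req;

        /* Arm the sleep before fetching so a concurrent wake-up is not lost. */
        set_current_state(TASK_INTERRUPTIBLE);
        spin_lock_irq(q->queue_lock);
        req = blk_fetch_request(q);     /* like mmc_queue_thread */
        spin_unlock_irq(q->queue_lock);

        if (req) {
            __set_current_state(TASK_RUNNING);
            mq->issue_fn(mq, req);      /* eventually reaches host->ops->request() */
        } else {
            schedule();                 /* sleep until my_request_fn() wakes us */
        }
    }
    __set_current_state(TASK_RUNNING);
    return 0;
}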