結構體
fd
fd也就是文件描述符,用于標識已經打開的文件、管道、socket等,是進程和內核的橋梁,允許進程執行各種文件操作。用戶態看到的fd只是一個整數;內核通過fdget()/fdget_pos()把它解析成下面的struct fd,其中保存了對應的struct file指針以及釋放時需要用到的標志
/*
 * Result of resolving a file descriptor number inside the kernel: the
 * open-file object plus a small set of flags.  ksys_write() obtains one
 * from fdget_pos() and hands it back with fdput_pos().
 */
struct fd {
	/* The open file; NULL when the descriptor could not be resolved. */
	struct file *file;
	/* NOTE(review): presumably tells fdput*() what to release - confirm. */
	unsigned int flags;
};
file
Linux內核中表示打開文件的結構體,包含了文件操作所需的各種信息和元數據。這是文件系統操作的核心結構之一,允許內核跟蹤每個打開的文件及其相關的狀態。
struct file {// 用于鏈接或者引用計數union {// 鏈表節點struct llist_node fu_llist;// Read-Copy-Update頭struct rcu_head fu_rcuhead;} f_u;// 文件路徑信息struct path f_path;// 文件的 inode 結構體,表示文件的具體內容和屬性struct inode *f_inode; /* cached value */// 指向文件操作結構體的指針,包含與文件相關的各種操作函數指針,如讀、寫、打開、關閉等const struct file_operations *f_op;/** Protects f_ep_links, f_flags.* Must not be taken from IRQ context.*/spinlock_t f_lock;enum rw_hint f_write_hint;// 引用計數,表示有多少引用指向這個文件結構體atomic_long_t f_count;// 文件標志,描述文件的各種屬性,如只讀、只寫、非阻塞等unsigned int f_flags;// 文件模式,指示文件的打開模式,如讀、寫、執行等fmode_t f_mode;// 位置鎖,用于保護文件讀寫位置的鎖struct mutex f_pos_lock;// 文件的讀寫位置偏移量loff_t f_pos;// 文件所有者結構體,包含文件的擁有者和訪問權限信息struct fown_struct f_owner;const struct cred *f_cred;// 文件預讀取狀態結構體,包含文件預讀取的相關信息struct file_ra_state f_ra;// 文件版本號,表示文件的版本信息。u64 f_version;
#ifdef CONFIG_SECURITYvoid *f_security;
#endif/* needed for tty driver, and maybe others */void *private_data;#ifdef CONFIG_EPOLL/* Used by fs/eventpoll.c to link all the hooks to this file */// 用于事件輪詢(epoll)系統調用的鏈表結struct list_head f_ep_links;struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */// 地址空間指針,表示文件的內存映射狀態struct address_space *f_mapping;// 寫回錯誤序列號,用于跟蹤文件寫回操作的錯誤errseq_t f_wb_err;
} __randomize_layout__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
inode
inode包含文件的所有元數據,支撐訪問控制、文件操作、同步、狀態管理和特定文件類型支持
/** Keep mostly read-only and often accessed (especially for* the RCU path lookup and 'stat' data) fields at the beginning* of the 'struct inode'*/
struct inode {// 文件的模式,包括文件類型和文件權限umode_t i_mode;// 操作標志,標識文件系統特定的操作unsigned short i_opflags;// 文件所有者的用戶 IDkuid_t i_uid;// 文件所有者的組 IDkgid_t i_gid;// 文件標志unsigned int i_flags;#ifdef CONFIG_FS_POSIX_ACLstruct posix_acl *i_acl;struct posix_acl *i_default_acl;
#endif// 指向 inode 操作函數的指針const struct inode_operations *i_op;// 指向文件系統超級塊struct super_block *i_sb;// 地址空間,描述文件內容在內存中的映射struct address_space *i_mapping;#ifdef CONFIG_SECURITYvoid *i_security;
#endif/* Stat data, not accessed from path walking */// inode 號,唯一標識一個文件unsigned long i_ino;/** Filesystems may only read i_nlink directly. They shall use the* following functions for modification:** (set|clear|inc|drop)_nlink* inode_(inc|dec)_link_count*/// 鏈接數,表示有多少個目錄項指向此 inodeunion {const unsigned int i_nlink;unsigned int __i_nlink;};// 設備號,對于設備文件有效dev_t i_rdev;// 文件大小loff_t i_size;struct timespec64 i_atime;struct timespec64 i_mtime;struct timespec64 i_ctime;spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */// 文件字節數、塊大小位數、寫入提示、文件占用塊數unsigned short i_bytes;u8 i_blkbits;u8 i_write_hint;blkcnt_t i_blocks;#ifdef __NEED_I_SIZE_ORDEREDseqcount_t i_size_seqcount;
#endif/* Misc */// 文件狀態unsigned long i_state;// 讀寫信號量,用于同步struct rw_semaphore i_rwsem;unsigned long dirtied_when; /* jiffies of first dirtying */unsigned long dirtied_time_when;// 特定結構:hash鏈表節點、IO列表struct hlist_node i_hash;struct list_head i_io_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACKstruct bdi_writeback *i_wb; /* the associated cgroup wb *//* foreign inode detection, see wbc_detach_inode() */int i_wb_frn_winner;u16 i_wb_frn_avg_time;u16 i_wb_frn_history;
#endif// 用于緩存回收的LRU列表struct list_head i_lru; /* inode LRU list */// 用于管理同一超級塊中的 inode的超級塊鏈表struct list_head i_sb_list;// 用于寫回緩沖的寫回列表struct list_head i_wb_list; /* backing dev writeback list */union {struct hlist_head i_dentry;struct rcu_head i_rcu;};// inode版本號atomic64_t i_version;// 引用計數atomic_t i_count;// 直接IO計數atomic_t i_dio_count;// 寫操作計數atomic_t i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)atomic_t i_readcount; /* struct files open RO */
#endifunion {const struct file_operations *i_fop; /* former ->i_op->default_file_ops */void (*free_inode)(struct inode *);};struct file_lock_context *i_flctx;struct address_space i_data;struct list_head i_devices;union {struct pipe_inode_info *i_pipe;struct block_device *i_bdev;struct cdev *i_cdev;char *i_link;unsigned i_dir_seq;};__u32 i_generation;#ifdef CONFIG_FSNOTIFY__u32 i_fsnotify_mask; /* all events this inode cares about */struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif#ifdef CONFIG_FS_ENCRYPTIONstruct fscrypt_info *i_crypt_info;
#endif#ifdef CONFIG_FS_VERITYstruct fsverity_info *i_verity_info;
#endifvoid *i_private; /* fs or device private pointer */
} __randomize_layout;
寫入——從write()到vfs
write()系統調用在內核的實現為sys_write,它只是對下面的ksys_write的一層簡單封裝。
本部分在真正文件系統操作調用之外,只是獲取釋放文件描述符、更新位置指針、寫入前檢查等操作
/*
 * Kernel-side body of the write() system call.
 *
 * Resolves @fd to an open file, delegates the actual write to vfs_write()
 * and stores the advanced position back into the file on success.
 *
 * Returns the value returned by vfs_write(), or -EBADF when @fd does not
 * resolve to an open file.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	/* Resolve the descriptor. */
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		/* Fetch the file's current position pointer (may be NULL). */
		loff_t pos, *ppos = file_ppos(f.file);

		/* Work on a local copy so f_pos is only updated on success. */
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		/* Let the VFS perform the write. */
		ret = vfs_write(f.file, buf, count, ppos);
		/* Publish the new file position. */
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		/* Release what fdget_pos() acquired. */
		fdput_pos(f);
	}

	return ret;
}
接著進入vfs,vfs實際也是調用真正文件系統的接口實現
/*
 * VFS entry point for writes: validates the request, then dispatches to
 * the filesystem through __vfs_write().
 *
 * Returns the value returned by __vfs_write(), or a negative errno:
 * -EBADF if the file was not opened for writing, -EINVAL if the file does
 * not support writing, -EFAULT if the user buffer fails access_ok(), or
 * the error from rw_verify_area().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* The file must have been opened for writing ... */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	/* ... and must support write operations at all. */
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	/* Reject an invalid user-space buffer. */
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	/* Validate the range that is about to be written. */
	ret = rw_verify_area(WRITE, file, pos, count);
	if (!ret) {
		/* Cap the size of a single write. */
		if (count > MAX_RW_COUNT)
			count = MAX_RW_COUNT;
		file_start_write(file);
		/* Perform the write. */
		ret = __vfs_write(file, buf, count, pos);
		/* On success send the fsnotify event and account the bytes. */
		if (ret > 0) {
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		/* Account one write system call. */
		inc_syscw(current);
		file_end_write(file);
	}

	return ret;
}
/*
 * Dispatch a write to the file's operation table.
 *
 * Uses ->write when the filesystem provides it, otherwise falls back to
 * ->write_iter via new_sync_write().  Returns -EINVAL when the file
 * provides neither method.
 */
static ssize_t __vfs_write(struct file *file, const char __user *p,
			   size_t count, loff_t *pos)
{
	if (file->f_op->write)
		return file->f_op->write(file, p, count, pos);
	else if (file->f_op->write_iter)
		return new_sync_write(file, p, count, pos);
	else
		return -EINVAL;
}
以下是ext4文件系統實現vfs接口的方法
const struct file_operations ext4_file_operations = {.llseek = ext4_llseek,.read_iter = ext4_file_read_iter,.write_iter = ext4_file_write_iter,.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT.compat_ioctl = ext4_compat_ioctl,
#endif.mmap = ext4_file_mmap,.mmap_supported_flags = MAP_SYNC,.open = ext4_file_open,.release = ext4_release_file,.fsync = ext4_sync_file,.get_unmapped_area = thp_get_unmapped_area,.splice_read = generic_file_splice_read,.splice_write = iter_file_splice_write,.fallocate = ext4_fallocate,
};
ext4 buffered or direct
在Linux中存在幾種不同的IO寫入方式
-
DAX: 字節級別的操作。要求額外的硬件支持
-
DIO:直接從用戶態寫入數據到硬盤中,跳過內核緩沖區,減少了上下文切換和數據復制開銷
塊級別操作,數據的讀寫需要是設備的塊大小和linux系統的頁大小的整數倍
-
BIO(Buffered IO,緩沖IO,注意不是塊層的struct bio):默認標準方式。數據會先從應用程序的地址空間拷貝到操作系統內核地址空間的頁緩存,然后再寫入磁盤。根據Linux的延遲寫機制,當數據寫到操作系統內核地址空間的頁緩存就意味著write()調用已經完成并返回,數據稍后才會被回寫到磁盤
緩沖寫入操作通常是異步的,數據首先寫入頁緩存,后續由內核的回寫線程將緩存數據寫入磁盤(早期內核使用pdflush守護進程,2.6.32起被per-bdi的flusher線程取代,現在由kworker線程執行)。直接I/O則繞過頁緩存把數據直接寫入磁盤,通常是同步的,但配合AIO時也可以異步提交。
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{// 獲取文件關聯的inodestruct inode *inode = file_inode(iocb->ki_filp);if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))return -EIO;// 如果文件系統配置支持直接訪問,且inode也允許,則進行直接寫入
#ifdef CONFIG_FS_DAXif (IS_DAX(inode))return ext4_dax_write_iter(iocb, from);
#endif// 如果IO控制塊設置了IOCB_DIRECT,則執行直接IO寫入,繞過頁緩存if (iocb->ki_flags & IOCB_DIRECT)return ext4_dio_write_iter(iocb, from);// 否則進行緩存寫入return ext4_buffered_write_iter(iocb, from);
}
extent
在以下代碼中出現了extent,那么extent是什么呢?
extent是一段連續的物理塊,表示文件數據在磁盤上的位置和長度。
- 邏輯起始塊(extent在文件內的起始邏輯塊號)
- 物理起始塊(extent在磁盤上的起始物理塊號)
- 長度(extent包含的連續塊數)
每個文件都有一個與之關聯的 extent 樹,其根節點存儲在 inode 中。樹中的節點包含 extent 或指向子節點的指針。
葉子節點:存儲實際的 extent 信息(邏輯起始塊、物理起始塊和長度)
內部節點:存儲指向下一級節點的指針。
內聯數據
內聯數據適用于包含大量小文件場景,將小文件數據直接儲存到文件系統的元數據結構中,可以減少空間浪費
孤兒列表
孤兒列表用于跟蹤在文件操作中可能會被中途刪除或者截斷的文件,確保即使在系統崩潰的情況下也能被正確處理
比如,在文件刪除中,inode被更新表示文件被刪除了,但是系統中途崩潰了,而實際刪除工作在后面進行,就會導致這些文件變為孤兒,文件元數據仍然存在,可是文件本身被邏輯刪除了
ext4 buffered IO
buffered IO部分主要做了以下事情
- 鎖定inode,防止并發修改,保證page緩存的一致性
- 檢查寫入操作是否合法,并進行一些預處理
- 寫入
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,struct iov_iter *from)
{ssize_t ret;struct inode *inode = file_inode(iocb->ki_filp);// 如果 iocb 的標志中包含 IOCB_NOWAIT,則返回不支持的操作錯誤if (iocb->ki_flags & IOCB_NOWAIT)return -EOPNOTSUPP;// 加鎖以保護 inode 數據結構inode_lock(inode);// 檢查寫入操作是否合法,并進行一些預處理ret = ext4_write_checks(iocb, from);if (ret <= 0)goto out;// 設置當前進程的 backing_dev_info 為 inode 對應的設備current->backing_dev_info = inode_to_bdi(inode);// 執行通用的寫入操作,將數據寫入到文件中ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);// 清除當前進程的 backing_dev_infocurrent->backing_dev_info = NULL;out:// 解鎖 inodeinode_unlock(inode);// 如果寫入操作成功,則更新文件位置,并同步寫入數據if (likely(ret > 0)) {iocb->ki_pos += ret;ret = generic_write_sync(iocb, ret);}// 返回寫入的字節數或者錯誤碼return ret;
}
寫入的執行最后還是回到了VFS。generic_perform_write處理從用戶空間到文件的寫入數據,方法是遍歷數據塊、與頁面緩存交互以定位或分配頁面、將數據復制到這些頁面、更新文件的元數據、將頁面標記為臟頁面以便稍后回寫到存儲,以及確保整個過程中的數據完整性和錯誤處理。
/*
 * Copy data from the iov_iter @i into the page cache of @file, starting
 * at @pos, at most one page per iteration.
 *
 * Each iteration faults in the user memory, asks the filesystem to
 * prepare the target page (->write_begin), copies the data, and lets the
 * filesystem finish the page (->write_end).
 *
 * Returns the number of bytes written if any were written, otherwise the
 * last status (0 or a negative errno such as -EFAULT or -EINTR).
 */
ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		/* Give up early if the task is being killed. */
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		/*
		 * Let the filesystem bring the target page into memory and
		 * prepare its buffers for the write.
		 */
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		/*
		 * If the file is writably mapped into user space, flush the
		 * data cache so stale cache contents cannot conflict with
		 * the new data.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/* Copy from the user buffer into the page-cache page. */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		/*
		 * Let the filesystem finish the page (update the file size,
		 * mark the page dirty, unlock it, ...).
		 */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);
ext4 write begin
ext4_write_begin處理將數據寫入文件的準備工作。確保正確設置數據結構和狀態,以便實際的數據寫入操作順利進行
在頁面緩存中獲取并鎖定頁面、啟動日志事務以確保文件系統的一致性、確定需要修改的特定塊,并在必要時從磁盤讀取任何現有數據,以避免覆蓋塊中未被本次寫入覆蓋的部分。(inode鎖由調用者ext4_buffered_write_iter持有,這里鎖定的是頁面。)
/*
 * ext4 ->write_begin: prepare one page-cache page for a write of @len
 * bytes at @pos.
 *
 * Grabs the page, starts a journal handle, maps (and where needed reads)
 * the affected blocks, and on success hands the locked page back through
 * @pagep.
 *
 * Returns 0 on success or a negative errno (-EIO after a forced shutdown,
 * -ENOMEM when no page could be grabbed, or an error from the journal or
 * the block mapping).
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks;
	handle_t *handle;
	int retries = 0;
	struct page *page;
	pgoff_t index;
	unsigned from, to;

	/* Refuse to work on a filesystem that was forcibly shut down. */
	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	trace_ext4_write_begin(inode, pos, len, flags);
	/*
	 * Reserve one block more for addition to orphan list in case
	 * we allocate blocks but write fails for some reason
	 */
	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
	/* Page index plus start/end offsets of the write inside that page. */
	index = pos >> PAGE_SHIFT;
	from = pos & (PAGE_SIZE - 1);
	to = from + len;

	/* Files that may hold inline data are tried on the inline path first. */
	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
						    flags, pagep);
		if (ret < 0)
			return ret;
		if (ret == 1)
			return 0;
	}

	/*
	 * The page is grabbed with grab_cache_page_write_begin() before the
	 * transaction is started.  This avoids long waits under heavy load
	 * or memory pressure while a handle is held and allows more flexible
	 * memory allocation.
	 */
retry_grab:
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	unlock_page(page);

retry_journal:
	/* Start a new ext4 transaction. */
	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
	if (IS_ERR(handle)) {
		put_page(page);
		return PTR_ERR(handle);
	}

	/* Re-lock the page and make sure it still belongs to this mapping. */
	lock_page(page);
	if (page->mapping != mapping) {
		/* The page got truncated from under us */
		unlock_page(page);
		put_page(page);
		ext4_journal_stop(handle);
		goto retry_grab;
	}
	/* In case writeback began while the page was unlocked */
	wait_for_stable_page(page);

	/* Map the blocks using the get_block variant that fits the inode. */
#ifdef CONFIG_FS_ENCRYPTION
	if (ext4_should_dioread_nolock(inode))
		ret = ext4_block_write_begin(page, pos, len,
					     ext4_get_block_unwritten);
	else
		ret = ext4_block_write_begin(page, pos, len,
					     ext4_get_block);
#else
	if (ext4_should_dioread_nolock(inode))
		ret = __block_write_begin(page, pos, len,
					  ext4_get_block_unwritten);
	else
		ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
	if (!ret && ext4_should_journal_data(inode)) {
		ret = ext4_walk_page_buffers(handle, page_buffers(page),
					     from, to, NULL,
					     do_journal_get_write_access);
	}

	/* Error handling; an -ENOSPC failure may restart the transaction. */
	if (ret) {
		bool extended = (pos + len > inode->i_size) &&
				!ext4_verity_in_progress(inode);

		unlock_page(page);
		/*
		 * __block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 *
		 * Add inode to orphan list in case we crash before
		 * truncate finishes
		 */
		if (extended && ext4_can_truncate(inode))
			ext4_orphan_add(handle, inode);

		ext4_journal_stop(handle);
		if (extended) {
			ext4_truncate_failed_write(inode);
			/*
			 * If truncate failed early the inode might
			 * still be on the orphan list; we need to
			 * make sure the inode is removed from the
			 * orphan list in that case.
			 */
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);
		}

		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_journal;
		put_page(page);
		return ret;
	}
	*pagep = page;
	return ret;
}
__block_write_begin通過映射頁中必要的塊來準備要寫的頁。遍歷每個塊,確保將其映射并標記為最新的,如果有必要,還會對只被部分覆蓋的塊發起磁盤讀取,以避免覆蓋未初始化的數據
/*
 * Prepare the buffers of a locked page for a write of @len bytes at @pos.
 *
 * @page:      page that will receive the data (must be locked).
 * @pos:       file position where the write starts.
 * @len:       number of bytes to be written.
 * @get_block: callback that maps a logical block to a physical block;
 *             when NULL, @iomap is used through iomap_to_bh() instead.
 * @iomap:     I/O mapping description used when @get_block is NULL.
 *
 * Every buffer touched by the write is mapped; buffers that are only
 * partially covered and not up to date are read from disk first.
 *
 * Returns 0 on success, the error from @get_block, or -EIO when one of
 * the issued reads did not complete successfully.
 *
 * Defined ahead of __block_write_begin() so that the wrapper calls an
 * already declared function.
 */
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block, struct iomap *iomap)
{
	unsigned from = pos & (PAGE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_SIZE);
	BUG_ON(to > PAGE_SIZE);
	BUG_ON(from > to);

	/* Attach buffer heads to the page and derive the block size from them. */
	head = create_page_buffers(page, inode, 0);
	blocksize = head->b_size;
	bbits = block_size_bits(blocksize);

	block = (sector_t)page->index << (PAGE_SHIFT - bbits);

	/* Walk every buffer head of the page, one block at a time. */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		/* Buffer lies completely outside the range being written. */
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		/* Map the buffer through get_block or the iomap if needed. */
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			if (get_block) {
				err = get_block(inode, block, bh, 1);
				if (err)
					break;
			} else {
				iomap_to_bh(inode, block, bh, iomap);
			}

			if (buffer_new(bh)) {
				clean_bdev_bh_alias(bh);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				/* Zero the parts of a new block the write will not cover. */
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		/*
		 * A partially overwritten buffer that is not up to date,
		 * not delayed and not unwritten must be read from disk.
		 */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}

/*
 * get_block-only front end of __block_write_begin_int(): same contract,
 * with no iomap.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	return __block_write_begin_int(page, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);
ext4 write end
ext4_write_end對頁的數據寫入做收尾工作。
如果寫入擴展了文件,則更新inode大小,必要時將inode標記為臟的,并處理任何清理,包括處理日志事務,如果寫入部分失敗,則截斷超出新文件大小的未初始化塊。保證寫操作后數據的完整性和一致性。
/*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  metadata
 * buffers are managed internally.
 *
 * Finishes a page write started by ext4_write_begin(): commits the copied
 * data, updates i_size if the file grew, unlocks and releases the page,
 * stops the journal handle and truncates blocks left beyond i_size.
 *
 * Returns the number of bytes accepted, or a negative errno.
 */
static int ext4_write_end(struct file *file,
			  struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
	int ret = 0, ret2;
	int i_size_changed = 0;
	int inline_data = ext4_has_inline_data(inode);
	bool verity = ext4_verity_in_progress(inode);

	trace_ext4_write_end(inode, pos, len, copied);
	/* Inline-data inodes use the inline path, everything else the block path. */
	if (inline_data) {
		ret = ext4_write_inline_data_end(inode, pos, len,
						 copied, page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			goto errout;
		}
		copied = ret;
	} else
		copied = block_write_end(file, mapping, pos,
					 len, copied, page, fsdata);
	/*
	 * it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 *
	 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
	 * blocks are being written past EOF, so skip the i_size update.
	 */
	if (!verity)
		i_size_changed = ext4_update_inode_size(inode, pos + copied);
	unlock_page(page);
	put_page(page);

	/*
	 * The write started beyond the old size (and no verity build is in
	 * progress): tell the page cache that i_size was extended.
	 */
	if (old_size < pos && !verity)
		pagecache_isize_extended(inode, old_size, pos);
	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed || inline_data)
		ext4_mark_inode_dirty(handle, inode);

	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
		/* if we have allocated more blocks and copied
		 * less. We will have blocks allocated outside
		 * inode->i_size. So truncate them
		 */
		ext4_orphan_add(handle, inode);
errout:
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	if (pos + len > inode->i_size && !verity) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return ret ? ret : copied;
}
ext4 direct IO
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{ssize_t ret;size_t count;loff_t offset;handle_t *handle;struct inode *inode = file_inode(iocb->ki_filp);bool extend = false, overwrite = false, unaligned_aio = false;// 鎖定inode。if (iocb->ki_flags & IOCB_NOWAIT) {if (!inode_trylock(inode))return -EAGAIN;} else {inode_lock(inode);}// 檢查是否支持直接IOif (!ext4_dio_supported(inode)) {inode_unlock(inode);/** Fallback to buffered I/O if the inode does not support* direct I/O.*/return ext4_buffered_write_iter(iocb, from);}// 寫入前檢查ret = ext4_write_checks(iocb, from);if (ret <= 0) {inode_unlock(inode);return ret;}// 同步未對齊的異步direct IO,防止數據損壞offset = iocb->ki_pos;count = iov_iter_count(from);if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&!is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {unaligned_aio = true;inode_dio_wait(inode);}/** Determine whether the I/O will overwrite allocated and initialized* blocks. If so, check to see whether it is possible to take the* dioread_nolock path.*/// 如果IO對齊且I/O覆蓋已分配和初始化的塊且 inode 支持無鎖直接讀取,則設置 overwrite 并降級寫鎖if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&ext4_should_dioread_nolock(inode)) {overwrite = true;downgrade_write(&inode->i_rwsem);}// 檢查寫操作的結束(offset + count)是否超過了inode的當前磁盤大小,啟動一個日志句柄來安全地管理對inode的更改,將該inode添加到孤立列表中,以處理寫操作期間可能發生的崩潰,并設置一個標志(extend),表示將擴展inode的大小。然后停止日志記錄句柄if (offset + count > EXT4_I(inode)->i_disksize) {handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);if (IS_ERR(handle)) {ret = PTR_ERR(handle);goto out;}ret = ext4_orphan_add(handle, inode);if (ret) {ext4_journal_stop(handle);goto out;}extend = true;ext4_journal_stop(handle);}// 執行直接 I/O 寫入ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,is_sync_kiocb(iocb) || unaligned_aio || extend);// 如果是擴展操作,需要再次啟動一個事務并更新磁盤大小if (extend)ret = ext4_handle_inode_extension(inode, offset, ret, count);out:if (overwrite)inode_unlock_shared(inode);elseinode_unlock(inode);if (ret >= 0 && iov_iter_count(from)) {ssize_t err;loff_t endbyte;// 回退到緩沖IOoffset = iocb->ki_pos;err = 
ext4_buffered_write_iter(iocb, from);if (err < 0)return err;// 在當前 I/O 操作覆蓋的范圍內,確保頁面緩存中的頁面被寫入磁盤并失效(即使緩存無效)。這是為了在必要時回退到緩沖 I/O 時,盡量保持直接 I/O 的語義ret += err;endbyte = offset + err - 1;err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,offset, endbyte);if (!err)invalidate_mapping_pages(iocb->ki_filp->f_mapping,offset >> PAGE_SHIFT,endbyte >> PAGE_SHIFT);}return ret;
}
ext4 BIO與DIO代碼有感
ext4 BIO(Buffered IO)與DIO(Direct IO)
-
ext4 BIO與DIO都會對inode進行鎖定。不同的是DIO支持IOCB_NOWAIT:如果鎖已經被其他人持有,直接返回-EAGAIN而不等待;BIO遇到IOCB_NOWAIT則直接返回-EOPNOTSUPP
-
BIO經過內核page緩存,而DIO則直接從用戶空間寫入到設備
-
DIO在回退到緩沖IO時,還會把寫入范圍內的緩存頁面寫入磁盤并使其失效,以盡量保持直接 I/O 的語義;對于未對齊的異步直接 I/O,則先等待該inode上進行中的直接 I/O完成并按同步方式執行,防止數據損壞
Ref
- https://elixir.bootlin.com/linux/v5.5-rc2/source