FS

FS

Mass-Storage Structure

disk:掉电保存,永久存储

memory:掉电即无

RAID:Redundant Array of Independent Disks

提高可靠性,冗余磁盘来降低出错的概率

  • Data Mirroring

    数据镜像

  • Data Striping

    并行读取文件

  • Error Correcting Code

    校错

RAID Levels:

  • RAID 0:将数据切分(striping)成block,分散存到多个磁盘上

    只提高了性能,但没有提高可靠性

  • RAID 1:将所有的文件都存储两份到所有磁盘上

    优点:提高了可靠性

    缺点:浪费空间,所有的文件都有备份

  • RAID 2:将文件以bit-level拆开,使用海明码纠错

    海明码:4 bit data + 3 bit parity,使用7个磁盘

    缺点:bit-level 读

  • RAID 3:同样是bit-level拆开,使用xor纠错

    优点:可以加快读的速度

    缺点:恢复的时间比较长

  • RAID 4,5,6:

    • 4:使用block作为切分,整体计算校验码
    • 5:使校验码均匀分布在所有磁盘中
    • 6:与5相同,但扩展了额外的校验块

I/O Management

  • polling:轮询

    CPU主动访问

  • interrupt:中断

    CPU被访问

Hardware support:

  • I/O instruction
  • register
  • memory-mapped I/O

Everything is a file.

/*
 * /dev/ptmx: file_operations table for the ptmx character device.
 * unix98_pty_init() fills it in via tty_default_fops() and then sets
 * its .open member to ptmx_open.
 */
static struct file_operations ptmx_fops __ro_after_init;

/*
 * Set up the Unix98 pty pair: allocate, configure and register the
 * master ("ptm") and slave ("pts") tty drivers, then create the
 * /dev/ptmx character device. Allocation or registration failures
 * panic with a message naming the step that failed.
 */
static void __init unix98_pty_init(void)
{
	/* Both halves of the pair are allocated with the same flag set. */
	const unsigned long pty_flags = TTY_DRIVER_RESET_TERMIOS |
					TTY_DRIVER_REAL_RAW |
					TTY_DRIVER_DYNAMIC_DEV |
					TTY_DRIVER_DEVPTS_MEM |
					TTY_DRIVER_DYNAMIC_ALLOC;

	ptm_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, pty_flags);
	if (IS_ERR(ptm_driver))
		panic("Couldn't allocate Unix98 ptm driver");

	pts_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, pty_flags);
	if (IS_ERR(pts_driver))
		panic("Couldn't allocate Unix98 pts driver");

	/* Master side: identity. */
	ptm_driver->driver_name = "pty_master";
	ptm_driver->name = "ptm";
	ptm_driver->major = UNIX98_PTY_MASTER_MAJOR;
	ptm_driver->minor_start = 0;
	ptm_driver->type = TTY_DRIVER_TYPE_PTY;
	ptm_driver->subtype = PTY_TYPE_MASTER;

	/*
	 * Master side: start from the standard termios, then clear the
	 * input, output and local flag words and pin the line settings.
	 */
	ptm_driver->init_termios = tty_std_termios;
	ptm_driver->init_termios.c_iflag = 0;
	ptm_driver->init_termios.c_oflag = 0;
	ptm_driver->init_termios.c_lflag = 0;
	ptm_driver->init_termios.c_cflag = B38400 | CS8 | CREAD;
	ptm_driver->init_termios.c_ispeed = 38400;
	ptm_driver->init_termios.c_ospeed = 38400;

	/* Slave side: identity. */
	pts_driver->driver_name = "pty_slave";
	pts_driver->name = "pts";
	pts_driver->major = UNIX98_PTY_SLAVE_MAJOR;
	pts_driver->minor_start = 0;
	pts_driver->type = TTY_DRIVER_TYPE_PTY;
	pts_driver->subtype = PTY_TYPE_SLAVE;

	/*
	 * Slave side: keeps the standard iflag/oflag/lflag values; only
	 * the control flags and speeds are overridden.
	 */
	pts_driver->init_termios = tty_std_termios;
	pts_driver->init_termios.c_cflag = B38400 | CS8 | CREAD;
	pts_driver->init_termios.c_ispeed = 38400;
	pts_driver->init_termios.c_ospeed = 38400;

	/* Cross-link the two drivers and attach their operation tables. */
	ptm_driver->other = pts_driver;
	pts_driver->other = ptm_driver;
	tty_set_operations(ptm_driver, &ptm_unix98_ops);
	tty_set_operations(pts_driver, &pty_unix98_ops);

	/* Register master first, then slave. */
	if (tty_register_driver(ptm_driver) != 0)
		panic("Couldn't register Unix98 ptm driver");
	if (tty_register_driver(pts_driver) != 0)
		panic("Couldn't register Unix98 pts driver");

	/*
	 * /dev/ptmx special device: default tty file operations with the
	 * open handler replaced by ptmx_open, at (TTYAUX_MAJOR, minor 2).
	 */
	tty_default_fops(&ptmx_fops);
	ptmx_fops.open = ptmx_open;
	cdev_init(&ptmx_cdev, &ptmx_fops);

	/* The region is only registered if cdev_add() succeeded. */
	if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1))
		panic("Couldn't register /dev/ptmx driver");
	if (register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0)
		panic("Couldn't register /dev/ptmx driver");

	device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx");
}

/*
 * struct file_operations
 *
 * A table of function pointers, one member per operation that can be
 * requested on a file. As the surrounding notes describe, different I/O
 * devices supply different functions here, so that ordinary file reads
 * and writes end up driving the corresponding hardware. For example,
 * unix98_pty_init() points ptmx_fops.open at ptmx_open.
 */
struct file_operations {
	/* NOTE(review): presumably the module providing these callbacks — confirm. */
	struct module *owner;
	loff_t (*llseek) (struct file *, loff_t, int);
	/* read/write take a user-space buffer (__user), a size and an offset pointer. */
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	/* Iterator-based variants taking a kiocb and an iov_iter instead of a raw buffer. */
	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
	int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
			unsigned int flags);
	/* Both iterate callbacks receive a dir_context. */
	int (*iterate) (struct file *, struct dir_context *);
	int (*iterate_shared) (struct file *, struct dir_context *);
	__poll_t (*poll) (struct file *, struct poll_table_struct *);
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	int (*mmap) (struct file *, struct vm_area_struct *);
	unsigned long mmap_supported_flags;
	/* open and release are the two callbacks here that also receive the inode. */
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **, void **);
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len);
	void (*show_fdinfo)(struct seq_file *m, struct file *f);
	/* This member exists only when CONFIG_MMU is not defined. */
#ifndef CONFIG_MMU
	unsigned (*mmap_capabilities)(struct file *);
#endif
	/* The next two callbacks operate on a pair of files (source and destination). */
	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
			loff_t, size_t, unsigned int);
	loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
				   struct file *file_out, loff_t pos_out,
				   loff_t len, unsigned int remap_flags);
	int (*fadvise)(struct file *, loff_t, loff_t, int);
	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
				unsigned int poll_flags);
	/* NOTE(review): __randomize_layout presumably permits member reordering at build time — confirm; do not rely on field order. */
} __randomize_layout;

使用文件的读写,来操作对应的硬件

在上述的函数中,将对应的IO设备和文件指针绑定起来,作为initialization

使用不同的函数指针,来针对不同的IO设备做处理

ioctl:

File system

使用文件系统作存储和IO的抽象

File: a contiguous logical space for storing information

  • proc file system:在磁盘上没有对应的文件,称为in memory file system

    内存中文件系统,保存了程序的运行信息

文件的组成:

  • name

  • identifier

  • type

    win中,使用文件扩展名来判断

    Linux中使用magic number来区分文件类型,没有后缀名一说

  • location

  • size

  • protection

  • structure

    有的没有具体的结构,有的会有database的结构

    • directory structure:文件夹,包含文件信息的目录集合

      可以delete, list

  • time, date and user identification

文件操作:

  • create

  • open:将文件的元信息载入到内存中

    在打开时,会涉及到共享内存的问题

  • read/write:需要维护一个pointer

  • seek

  • close

  • delete

  • truncate

访问方式:

  • sequential access
  • direct access

单级文件系统 -> 二级文件系统 -> 树形文件系统 -> 无环图

所以就有了:

  • absolute path
  • relative path

无环图:添加了回指的指针,并且会检测每次添加是否有一个环产生

挂载:将文件系统载入到内存中,让file_operation指向正确的函数

  • mount point:挂载点

文件共享:uid, gid

  • remote file sharing:FTP

文件保护(访问保护):

  • ACL(Access control list)

    $ chmod xxx
      
    u(user)    g(group)    o(other)
    

FS implementation

  • File system structure:

    U盘使用的是FAT文件系统,linux默认是Ext 2/4

    存储在硬盘上,为程序提供了接口

    • File system layers:

      app --> logical file system --> file organization module --> basic file system --> IO control --> device
      
      • logical file system:

        • meta-data
        • directory
        • File control block:FCB
          • name
          • permissions
          • dates
          • owner, group, ACL
          • size
        • input
        • output
      • file organization module

        输入逻辑块,输出物理块

      • basic file system(blocks)

        buffers, caches

      • IO control

      • device

on disk structure:永久性的断电不丢的

  • volume control block

    包括了文件系统的meta-data

  • directory

  • per-file File Control Block

in memory structure:断电丢失的

  • mount table

  • directory cache

  • global open-file table

  • per-process open-file table

    fid:文件在该表中的索引

针对文件操作:

  • create:

    创建一个新的FCB

  • open:

    首先在global中搜索是否已经打开

    • 已打开:创建接口

    • 未打开:将FCB载入到内存中,并写入到per-process open-file table和global open-file table

      并对打开文件进行计数

  • close

    关闭所有的打开文件后,关闭该文件入口

UFS:使用inode来唯一指定文件

文件系统的挂载:

boot loader

volume control table -- memory --> mount table

virtual file system:

提供了类似于面向对象的实现

面向上层:提供统一的接口

面向下层:为不同的文件系统,使用不同的函数指针

  • super block
  • inode
  • dentry
  • file

目录的实现:

存储了以inode为索引的目录项

  • 线性链表
  • 哈希表

物理存储空间的分配:

  • 连续分配

    external fragmentation

  • 链表式分配

    每次都要做一次链表式的查找

    FAT-File allocation table

  • 索引式分配

    为每个文件提供一个索引表,顺序遍历索引表

    表的大小限制了文件的大小:将该表链起来,依次来扩容

    多级索引表:类似page table的实现

    image-20221227174621624

128 pointers:
125 direct data pointers + 1 single indirect + 1 double indirect + 1 triple indirect

管理free-space

  • bit map

  • linked free space

    不能很轻易地拿到全部的block

  • grouping and counting

提升file system的性能:

  • data和metadata放到一起
  • cache
  • asynchronous writes
  • free-behind and read-ahead
  • reads are frequently slower than writes

使用page(file) cache

文件恢复:

  • back up

  • LSFS:Log Structured File Systems

    将操作,先记到log中,之后再进行具体操作

In practice

  • external name:面向用户
  • internal name:面向机器

directory: external name <---> internal name

fd: file descriptor

  • hard link: a directory entry

    inode

  • soft link: a file

    inode

soft link会更灵活