How many files in one directory is too many for ext4?

I wrote a Golang program to measure the time it takes to create 100 new files and to read 100 existing files, in a directory containing 2k files versus a directory containing 200k files:

// Create 200k files in one directory vs 200k files in 100 separate directories
// See if speed of accessing files is affected
package main

import (
    "fmt"
    "log"
    "os"
    "time"

    "github.com/1f604/util"
)

func main() {
    // First, create 100 directories
    filepaths := []string{}
    for i := 0; i < 100; i++ {
        newfilepath := "/tmp/dir" + util.Int64_to_string(int64(i)) + "/"
        filepaths = append(filepaths, newfilepath)
        err := os.MkdirAll(newfilepath, os.ModePerm)
        util.Check_err(err)
    }
    fmt.Println("Created 100 directories.")
    // Next, create 2k files in each directory
    fmt.Println("Now creating 2k x 10kb files in each small directory.")
    for i := 0; i < 100; i++ {
        for j := 0; j < 2000; j++ {
            f, err := os.Create("/tmp/dir" + util.Int64_to_string(int64(i)) + "/" + util.Int64_to_string(int64(j)) + ".txt")
            if err != nil {
                log.Fatal(err)
            }
            // Truncate just extends the empty file to 10kb; no data is written.
            if err := f.Truncate(1e4); err != nil {
                log.Fatal(err)
            }
            // Close explicitly: leaving 200k files open would exhaust the
            // process's file descriptor limit long before the loops finish.
            f.Close()
        }
    }

    // Next, create 200k files in one directory
    fmt.Println("Now creating 200k x 10kb files in one big directory.")
    // os.Create does not create parent directories, so make /tmp/bigdir first.
    err := os.MkdirAll("/tmp/bigdir/", os.ModePerm)
    util.Check_err(err)
    for j := 0; j < 200000; j++ {
        f, err := os.Create("/tmp/bigdir/" + util.Int64_to_string(int64(j)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        if err := f.Truncate(1e4); err != nil {
            log.Fatal(err)
        }
        f.Close()
    }

    // Now time read and write times
    fmt.Println("Now creating 100 x 10kb files in a small directory.")
    start := time.Now()
    for j := 0; j < 100; j++ {
        f, err := os.Create("/tmp/dir1/test" + util.Int64_to_string(int64(j)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        if err := f.Truncate(1e4); err != nil {
            log.Fatal(err)
        }
        f.Close()
    }
    fmt.Println("Time taken:", time.Since(start))

    fmt.Println("Now reading 100 random 10kb files in a small directory.")
    start = time.Now()
    list := [][]byte{}
    for j := 0; j < 100; j++ {
        num, err := util.Crypto_Randint(2000)
        util.Check_err(err)
        contents, err := os.ReadFile("/tmp/dir2/" + util.Int64_to_string(int64(num)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        list = append(list, contents)
    }
    fmt.Println("Time taken:", time.Now().Sub(start))

    fmt.Println("Now creating 100 x 10kb files in a big directory.")
    start = time.Now()
    for j := 0; j < 100; j++ {
        f, err := os.Create("/tmp/bigdir/test" + util.Int64_to_string(int64(j)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        if err := f.Truncate(1e4); err != nil {
            log.Fatal(err)
        }
        f.Close()
    }
    fmt.Println("Time taken:", time.Since(start))

    fmt.Println("Now reading 100 random 10kb files in a big directory.")
    start = time.Now()
    for j := 0; j < 100; j++ {
        num, err := util.Crypto_Randint(200000)
        util.Check_err(err)
        contents, err := os.ReadFile("/tmp/bigdir/" + util.Int64_to_string(int64(num)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        list = append(list, contents)
    }
    fmt.Println("Time taken:", time.Now().Sub(start))
}

Here are my results on a Debian 12 (ext4) system:

Created 100 directories.
Now creating 2k x 10kb files in each small directory.
Now creating 200k x 10kb files in one big directory.
Now creating 100 x 10kb files in a small directory.
Time taken: 2.361316ms
Now reading 100 random 10kb files in a small directory.
Time taken: 5.792292ms
Now creating 100 x 10kb files in a big directory.
Time taken: 2.922209ms
Now reading 100 random 10kb files in a big directory.
Time taken: 3.835541ms

Reading 100 random files from the big directory is consistently faster than reading 100 random files from a small directory, but how can that be?

Is my benchmarking code incorrect?

Thanks.

Update: after following @Paul_Pedant's suggestion to flush the page cache after creating the files, I got completely different results!
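
For reproducibility, here is a minimal sketch of the kind of cache flush I mean (my assumption of the standard approach, equivalent to running sync followed by echo 3 > /proc/sys/vm/drop_caches as root; this snippet is not part of the benchmark itself):

// dropcaches.go: flush dirty pages, then drop the kernel caches (run as root).
package main

import (
    "log"
    "os"
    "syscall"
)

func main() {
    syscall.Sync() // flush dirty pages to disk first
    // Writing "3" drops the page cache plus the dentry and inode caches.
    if err := os.WriteFile("/proc/sys/vm/drop_caches", []byte("3"), 0o644); err != nil {
        log.Fatal(err)
    }
}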

Here are my new results:

Now creating 100 x 10kb files in a small directory.
Time taken: 19.475348ms
Now reading 100 random 10kb files in a small directory.
Time taken: 26.309475ms
Now creating 100 x 10kb files in a big directory.
Time taken: 75.570411ms
Now reading 100 random 10kb files in a big directory.
Time taken: 152.495391ms

This suggests that the surprising result I saw earlier was purely an artifact of the page cache, and that reading 100 random files from a 200k-file directory really is much slower than reading 100 random files from a 2k-file directory (152 ms vs 26 ms).

Update: I realized my original test was unfair, because I accessed all 100 files from the same small directory, whereas in a realistic scenario I would access them from random directories.

So I updated my benchmark program to make it more realistic (note: the program assumes you have already created the directories and files, and you need to flush the page cache, as above, before running it):

package main

import (
    "fmt"
    "log"
    "os"
    "time"

    "math/rand"

    "github.com/1f604/util"
)

func main() {
    // Now time read and write times
    fmt.Println("Now creating 100 x 10kb files in a small directory.")
    start := time.Now()
    for j := 0; j < 100; j++ {
        num1 := rand.Intn(100)
        num2 := rand.Intn(2000)
        f, err := os.Create("/tmp/dir" + util.Int64_to_string(int64(num1)) + "/test" + util.Int64_to_string(int64(num2)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        // Extend to 10kb (1e4 bytes), matching the printed message.
        if err := f.Truncate(1e4); err != nil {
            log.Fatal(err)
        }
        f.Close()
    }
    fmt.Println("Time taken:", time.Since(start))

    fmt.Println("Now reading 1000 random 10kb files in a small directory.")
    start = time.Now()
    list := [][]byte{}
    for j := 0; j < 1000; j++ {
        num1 := rand.Intn(100)
        num2 := rand.Intn(2000)
        contents, err := os.ReadFile("/tmp/dir" + util.Int64_to_string(int64(num1)) + "/" + util.Int64_to_string(int64(num2)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        list = append(list, contents)
    }
    fmt.Println("Time taken:", time.Now().Sub(start))

    fmt.Println("Now creating 100 x 10kb files in a big directory.")
    start = time.Now()
    for j := 0; j < 100; j++ {
        f, err := os.Create("/tmp/bigdir/test" + util.Int64_to_string(int64(j)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        // Extend to 10kb (1e4 bytes), matching the printed message.
        if err := f.Truncate(1e4); err != nil {
            log.Fatal(err)
        }
        f.Close()
    }
    fmt.Println("Time taken:", time.Since(start))

    fmt.Println("Now reading 1000 random 10kb files in a big directory.")
    start = time.Now()
    for j := 0; j < 1000; j++ {
        num := rand.Intn(200000)
        contents, err := os.ReadFile("/tmp/bigdir/" + util.Int64_to_string(int64(num)) + ".txt")
        if err != nil {
            log.Fatal(err)
        }
        list = append(list, contents)
    }
    fmt.Println("Time taken:", time.Now().Sub(start))
}

And here are my new results:

Now creating 100 x 10kb files in a small directory.
Time taken: 70.31699ms
Now reading 1000 random 10kb files in a small directory.
Time taken: 758.609004ms
Now creating 100 x 10kb files in a big directory.
Time taken: 32.695134ms
Now reading 1000 random 10kb files in a big directory.
Time taken: 574.266544ms

(These results were obtained after flushing the page cache.)

Now it seems that all of the small directories' advantages have disappeared; instead, the big directory now appears to be faster.

I think this suggests that repeatedly accessing the same directory makes subsequent file accesses in it faster? Another explanation is that because the files are so small (10kb), they sit in nearby blocks on the physical device, so accessing neighboring files is faster. But I don't know.
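
One way to sanity-check the caching hypothesis (a sketch I am adding here, not part of the benchmark; the path is just an example file from the runs above) is to flush the caches and then compare a cold read with an immediately repeated read of the same file:

// cachewarm.go: after flushing the caches, the first read has to walk the
// directory on disk, while the second read of the same path should be served
// from the dentry and page caches.
package main

import (
    "fmt"
    "log"
    "os"
    "time"
)

func timedRead(path string) time.Duration {
    start := time.Now()
    if _, err := os.ReadFile(path); err != nil {
        log.Fatal(err)
    }
    return time.Since(start)
}

func main() {
    path := "/tmp/bigdir/0.txt" // any file created by the benchmark above
    fmt.Println("cold read:", timedRead(path))
    fmt.Println("warm read:", timedRead(path))
}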
