Ruby 入门指南 / 第 14 章：文件与数据

第 14 章：文件与数据

“数据是新时代的石油，文件操作是炼油的技术。”

14.1 文件读取

14.1.1 基本读取

# 一次性读取整个文件
content = File.read("data.txt")
content = File.read("data.txt", encoding: "UTF-8")

# 读取为行数组
lines = File.readlines("data.txt")
lines = File.readlines("data.txt", chomp: true)  # 去除换行符

# 逐行读取（大文件推荐）
File.foreach("data.txt") do |line|
  puts line.chomp
end

# 带索引的逐行读取
File.foreach("data.txt").with_index do |line, index|
  puts "#{index + 1}: #{line.chomp}"
end

14.1.2 块模式读取

# 使用块确保文件关闭
File.open("data.txt", "r") do |file|
  file.each_line do |line|
    puts line.chomp
  end
end

# 二进制读取
File.open("image.png", "rb") do |file|
  data = file.read
  puts "Size: #{data.bytesize} bytes"
end

14.1.3 逐块读取

# 逐块读取大文件
File.open("large_file.bin", "rb") do |file|
  while (chunk = file.read(4096))
    process_chunk(chunk)
  end
end

# 带偏移量读取
File.open("data.txt") do |file|
  file.seek(10)           # 跳到第 10 字节
  data = file.read(20)    # 读取 20 字节
  puts file.pos           # 当前位置
end

14.2 文件写入

14.2.1 基本写入

# 覆盖写入
File.write("output.txt", "Hello, World!\n")

# 追加写入
File.open("log.txt", "a") do |file|
  file.puts "[#{Time.now}] New entry"
end

# 追加模式简写
File.write("log.txt", "New line\n", mode: "a")

# 写入多行
File.open("data.txt", "w") do |file|
  file.puts "Line 1"
  file.puts "Line 2"
  file.puts "Line 3"
end

14.2.2 临时文件

require "tempfile"

# 创建临时文件
Tempfile.create(["output", ".txt"]) do |file|
  file.write("Temporary data")
  file.rewind
  puts file.read
end
# 注意：以块形式调用 Tempfile.create 时，文件会在块结束后自动关闭并删除

# 块模式（自动清理）
Tempfile.open(["output", ".txt"]) do |file|
  file.write("Temporary data")
  file.rewind
  puts file.read
end
# 文件在块结束后自动删除

# 指定临时目录
Tempfile.create(["output", ".txt"], "/tmp") do |file|
  file.write("Data in /tmp")
end

14.3 文件操作

14.3.1 文件信息

# 文件存在性
File.exist?("data.txt")       # => true/false
File.file?("data.txt")        # => true（是文件）
File.directory?("my_dir")     # => true（是目录）
File.symlink?("link")         # => true（是符号链接）

# 文件大小和时间
File.size("data.txt")         # => 字节数
File.mtime("data.txt")        # => 最后修改时间
File.atime("data.txt")        # => 最后访问时间
File.ctime("data.txt")        # => 状态变更时间（Unix 上是 inode 变更时间，并非创建时间；Windows 上为创建时间）

# 文件权限
File.readable?("data.txt")    # => 可读
File.writable?("data.txt")    # => 可写
File.executable?("script.sh") # => 可执行

# 文件扩展名和路径
File.extname("data.txt")      # => ".txt"
File.basename("dir/data.txt") # => "data.txt"
File.basename("data.txt", ".txt")  # => "data"
File.dirname("dir/data.txt")  # => "dir"
File.split("dir/data.txt")    # => ["dir", "data.txt"]
File.expand_path("../data.txt", __FILE__)  # => 绝对路径

14.3.2 目录操作

# 创建目录
Dir.mkdir("new_dir")
Dir.mkdir("new_dir", 0755)  # 带权限

# 递归创建
require "fileutils"
FileUtils.mkdir_p("path/to/deep/dir")

# 删除目录
Dir.rmdir("empty_dir")
Dir.delete("empty_dir")

# 列出目录内容
Dir.entries(".")              # 所有条目
Dir.children(".")             # 不含 . 和 ..
Dir.glob("*")                 # 通配符匹配
Dir.glob("**/*.rb")           # 递归匹配
Dir.glob("*.{rb,py,js}")      # 多扩展名

# 遍历目录
Dir.glob("**/*.rb").each do |file|
  puts file
end

# 当前目录和切换
Dir.pwd                       # 当前目录
Dir.chdir("/tmp")             # 切换目录
Dir.home                      # 用户主目录
Dir.tmpdir                    # 临时目录（需要先 require "tmpdir"）

14.3.3 文件操作

require "fileutils"

# 复制文件
FileUtils.cp("source.txt", "dest.txt")
FileUtils.cp_r("source_dir", "dest_dir")  # 递归复制

# 移动/重命名
FileUtils.mv("old.txt", "new.txt")

# 删除文件
FileUtils.rm("file.txt")
FileUtils.rm_rf("dir")  # 递归删除（危险！）

# 创建链接
FileUtils.ln_s("target", "link")  # 符号链接

# 比较文件
FileUtils.identical?("file1.txt", "file2.txt")  # => true/false

# 创建空文件或更新时间戳（注意：touch 并不是原子写入）
FileUtils.touch("file.txt")  # 创建或更新时间戳

14.4 YAML 处理

14.4.1 读写 YAML

require "yaml"

# 读取 YAML
config = YAML.load_file("config.yml")
config = YAML.safe_load_file("config.yml", permitted_classes: [Symbol, Date, Time])

# 解析 YAML 字符串
data = YAML.parse("---\nname: Alice\nage: 25\n")
hash = YAML.load("---\nname: Alice\nage: 25\n")

# 写入 YAML
data = { name: "Alice", age: 25, hobbies: ["reading", "coding"] }
File.write("data.yml", data.to_yaml)

# YAML.dump
File.open("data.yml", "w") do |file|
  YAML.dump(data, file)
end

14.4.2 YAML 配置文件

# config.yml
# database:
#   adapter: postgresql
#   host: localhost
#   port: 5432
#   name: myapp_development
#
# server:
#   host: 0.0.0.0
#   port: 3000
#
# logging:
#   level: info
#   file: log/app.log

# 加载配置
# Read-only view over a YAML configuration file.
#
# Values can be fetched with a dotted path (+get("database.host")+) or with
# chained method calls (+config.database.host+).
class Config
  # Wraps an already-parsed Hash without reading anything from disk.
  # Nested sections are wrapped this way so that chained method access works.
  #
  # @param data [Hash] a parsed configuration section
  # @return [Config]
  def self.from_hash(data)
    allocate.tap { |config| config.instance_variable_set(:@data, data) }
  end

  # @param path [String] YAML file to load; a missing or empty file yields an
  #   empty configuration instead of an error
  def initialize(path = "config.yml")
    @data = YAML.safe_load_file(path, permitted_classes: [Symbol]) if File.exist?(path)
    @data ||= {}
  end

  # Looks up a value by dotted key path.
  #
  # @param key [String, Symbol] path such as "database.host"
  # @param default [Object] returned when the path does not resolve to a value
  # @return [Object] the stored value (nested sections are returned as plain
  #   Hashes), or +default+ when the path is missing
  def get(key, default = nil)
    value = key.to_s.split(".").reduce(@data) { |node, segment| lookup(node, segment) }
    # Only a missing value falls back to the default; a stored `false` is kept.
    value.nil? ? default : value
  end

  # Exposes top-level keys as methods. Hash values are wrapped in a Config so
  # that calls can be chained; unknown names raise NoMethodError as usual.
  def method_missing(name, *args)
    return super unless key?(name.to_s)

    value = lookup(@data, name.to_s)
    value.is_a?(Hash) ? self.class.from_hash(value) : value
  end

  # Keeps respond_to? consistent with method_missing.
  def respond_to_missing?(name, include_private = false)
    key?(name.to_s) || super
  end

  private

  # Reads one path segment from +node+, accepting string or symbol keys.
  # Returns nil when +node+ is not a Hash (e.g. the path runs past a scalar).
  def lookup(node, segment)
    return nil unless node.is_a?(Hash)

    node.fetch(segment) { node[segment.to_sym] }
  end

  # True when the top-level data has +segment+ as a string or symbol key.
  def key?(segment)
    @data.is_a?(Hash) && (@data.key?(segment) || @data.key?(segment.to_sym))
  end
end

config = Config.new
config.database.host  # => "localhost"
config.server.port    # => 3000

14.5 JSON 处理

14.5.1 基本操作

require "json"

# 解析 JSON
json_string = '{"name": "Alice", "age": 25, "hobbies": ["reading", "coding"]}'
data = JSON.parse(json_string)
data["name"]     # => "Alice"
data["hobbies"]  # => ["reading", "coding"]

# 从文件读取
data = JSON.parse(File.read("data.json"))
data = JSON.load_file("data.json")  # Ruby 3.0+（json 2.4 及以上）

# 生成 JSON
hash = { name: "Alice", age: 25, active: true }
json = hash.to_json
# => '{"name":"Alice","age":25,"active":true}'

# 格式化输出
json = JSON.pretty_generate(hash)
# {
#   "name": "Alice",
#   "age": 25,
#   "active": true
# }

# 写入文件
File.write("output.json", JSON.pretty_generate(data))

14.5.2 JSON 处理技巧

require "json"

# 嵌套数据
users = [
  {
    id: 1,
    name: "Alice",
    profile: {
      email: "alice@example.com",
      age: 25
    }
  },
  {
    id: 2,
    name: "Bob",
    profile: {
      email: "bob@example.com",
      age: 30
    }
  }
]

# 转为 JSON
json = users.to_json

# 解析并处理
parsed = JSON.parse(json, symbolize_names: true)
parsed.each do |user|
  puts "#{user[:name]}: #{user[:profile][:email]}"
end

# 使用 JSON::Ext 加速（默认已包含）
# 可选：使用 Oj gem 获取更好性能
# require "oj"
# Oj.load(json_string)
# Oj.dump(ruby_object)

14.6 CSV 处理

14.6.1 读取 CSV

require "csv"

# 读取为数组
CSV.foreach("data.csv") do |row|
  puts row.inspect  # => ["name", "age", "city"]
end

# 读取为哈希（带表头）
CSV.foreach("data.csv", headers: true) do |row|
  puts "#{row['name']}: #{row['age']}"
end

# 一次性读取
table = CSV.read("data.csv", headers: true)
table.each do |row|
  puts row["name"]
end

# 转为数组
rows = CSV.read("data.csv")  # => [["name", "age"], ["Alice", "25"], ...]

# 转为哈希数组
data = CSV.table("data.csv")  # => CSV::Table（表头转为符号、数值自动转换）；需要哈希数组时再调用 data.map(&:to_h)

14.6.2 写入 CSV

require "csv"

# 写入为数组
CSV.open("output.csv", "w") do |csv|
  csv << ["name", "age", "city"]
  csv << ["Alice", 25, "Beijing"]
  csv << ["Bob", 30, "Shanghai"]
end

# 字符串输出
csv_string = CSV.generate do |csv|
  csv << ["name", "age"]
  csv << ["Alice", 25]
  csv << ["Bob", 30]
end
puts csv_string

# 使用 headers 选项写入
CSV.open("output.csv", "w", headers: %w[name age city], write_headers: true) do |csv|
  csv << ["Alice", 25, "Beijing"]
  csv << ["Bob", 30, "Shanghai"]
end

14.6.3 CSV 数据处理

require "csv"

# 读取并处理
users = CSV.read("users.csv", headers: true)

# 过滤
adults = users.select { |row| row["age"].to_i >= 18 }

# 转换
names = users.map { |row| row["name"] }

# 分组
by_city = users.group_by { |row| row["city"] }

# 聚合
avg_age = users.sum { |row| row["age"].to_f } / users.length

# 写出处理结果
CSV.open("adults.csv", "w", headers: users.headers, write_headers: true) do |csv|
  adults.each { |row| csv << row }
end

# 自定义列分隔符
CSV.foreach("data.tsv", col_sep: "\t") do |row|
  puts row.inspect
end

14.7 序列化

14.7.1 Marshal 序列化

# Marshal is Ruby's built-in binary serialization format
data = { name: "Alice", scores: [95, 87, 92], active: true }

# Serialize to a binary string. Use binwrite so the bytes are written
# verbatim (text mode may translate line endings on some platforms).
binary = Marshal.dump(data)
File.binwrite("data.marshal", binary)

# Deserialize. Read in binary mode for the same reason, and only ever load
# data you trust: Marshal.load can instantiate arbitrary objects.
loaded = Marshal.load(File.binread("data.marshal"))
loaded[:name]  # => "Alice"

# Deep copy via a Marshal round-trip
original = { a: [1, 2, 3], b: { c: 4 } }
copy = Marshal.load(Marshal.dump(original))
copy[:a] << 4
original[:a]  # => [1, 2, 3] (unaffected)

14.7.2 自定义序列化

require "json"
require "time" # Time.parse is defined by the "time" stdlib extension, not by core Ruby

# A user record that can be round-tripped through Marshal and JSON.
class User
  attr_reader :name, :email, :created_at

  # @param name [String]
  # @param email [String]
  # @param created_at [Time] creation timestamp; defaults to the current time
  def initialize(name, email, created_at: Time.now)
    @name = name
    @email = email
    @created_at = created_at
  end

  # Custom Marshal serialization: called by Marshal.dump.
  #
  # @return [Hash] the state to serialize (timestamp stored as a string)
  def marshal_dump
    serializable_hash
  end

  # Custom Marshal deserialization: called by Marshal.load with the value
  # produced by #marshal_dump (#initialize is not run).
  def marshal_load(data)
    @name = data[:name]
    @email = data[:email]
    @created_at = Time.parse(data[:created_at])
  end

  # JSON serialization.
  #
  # @return [String] a JSON object with name, email and created_at
  def to_json(*args)
    serializable_hash.to_json(*args)
  end

  # Rebuilds a User from the output of #to_json.
  #
  # @param json_string [String]
  # @return [User] with the original created_at when the JSON carries one,
  #   otherwise stamped with the current time
  def self.from_json(json_string)
    data = JSON.parse(json_string, symbolize_names: true)
    stamp = data[:created_at]
    return new(data[:name], data[:email]) unless stamp

    new(data[:name], data[:email], created_at: Time.parse(stamp))
  end

  private

  # Shared representation for both formats. Time#to_s keeps whole seconds
  # only, so sub-second precision is not preserved across a round-trip.
  def serializable_hash
    { name: @name, email: @email, created_at: @created_at.to_s }
  end
end

user = User.new("Alice", "alice@example.com")

# Marshal
binary = Marshal.dump(user)
restored = Marshal.load(binary)

# JSON
json = user.to_json
restored = User.from_json(json)

14.8 路径处理

14.8.1 Pathname

require "pathname"

# 创建 Pathname
path = Pathname.new("/home/user/data.txt")
path = Pathname("/home/user/data.txt")

# 路径信息
path.basename       # => #<Pathname:data.txt>
path.basename(".txt") # => #<Pathname:data>
path.dirname        # => #<Pathname:/home/user>
path.extname        # => ".txt"
path.to_s           # => "/home/user/data.txt"

# 路径操作
path.parent         # => #<Pathname:/home/user>
path + "other.txt"  # => #<Pathname:/home/user/data.txt/other.txt>（在末尾追加；同级文件请用 path.dirname + "other.txt"）
path.expand_path    # => 绝对路径

# 文件操作
path.exist?         # => true/false
path.file?          # => true
path.directory?     # => false
path.read           # => 文件内容
path.write("data")  # 写入
path.size           # => 字节数

# 目录遍历
Pathname(".").children                        # 当前目录子项
Pathname(".").glob("**/*.rb")                 # 递归查找

# 链式操作
Pathname("output")
  .join("data", "results")
  .expand_path
  .parent

14.9 实际业务场景

14.9.1 日志处理

require "csv"
require "json"

require "time" # Time.parse comes from the "time" stdlib extension, not core Ruby

# Parses log lines of the form "[YYYY-MM-DD HH:MM:SS] LEVEL message".
class LogParser
  # Captures: 1 = timestamp, 2 = level, 3 = message (rest of the line).
  LINE_PATTERN = /\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+) (.+)/.freeze
  CSV_HEADERS = %w[time level message].freeze

  # @param log_file [String] path of the log file to read
  def initialize(log_file)
    @log_file = log_file
  end

  # Reads the log line by line and returns the entries that could be parsed.
  # Lines that do not match the expected format are skipped.
  #
  # @return [Array<Hash>] hashes with :time (Time), :level and :message
  def parse
    File.foreach(@log_file).filter_map { |line| parse_line(line) }
  end

  # Writes the parsed entries to a CSV file with a header row.
  #
  # @param output_file [String] destination path (overwritten)
  def to_csv(output_file)
    entries = parse

    CSV.open(output_file, "w", write_headers: true, headers: CSV_HEADERS) do |csv|
      entries.each { |entry| csv << entry.values_at(:time, :level, :message) }
    end
  end

  # Summarizes the log.
  #
  # @return [Hash] :total entry count, :by_level counts keyed by level, and
  #   the :first / :last entry times (nil when there are no entries)
  def statistics
    entries = parse
    {
      total: entries.length,
      by_level: entries.map { |entry| entry[:level] }.tally,
      first: entries.first&.fetch(:time),
      last: entries.last&.fetch(:time)
    }
  end

  private

  # Converts one line into an entry hash, or nil when the line is not a log
  # entry. An ArgumentError while interpreting the line (for example an
  # impossible timestamp such as month 13) also yields nil, so one bad line
  # does not abort the whole parse.
  def parse_line(line)
    match = LINE_PATTERN.match(line)
    return unless match

    { time: Time.parse(match[1]), level: match[2], message: match[3] }
  rescue ArgumentError
    nil
  end
end

14.9.2 配置管理器

require "yaml"
require "json"

require "date" # defines the Date constant referenced in permitted_classes below

# Loads every YAML/JSON file in a directory into one merged configuration
# and exposes it through dotted-path lookups.
class ConfigManager
  # @param config_dir [String] directory scanned for *.yml, *.yaml and *.json
  def initialize(config_dir = "config")
    @config_dir = config_dir
    @config = {}
    load_all
  end

  # Looks up a value by dotted key path; string and symbol keys both match.
  #
  # @param key_path [String, Symbol] path such as "database.host"
  # @param default [Object] returned when the path does not resolve to a value
  # @return [Object] the stored value, or +default+ when it is missing
  def get(key_path, default = nil)
    value = key_path.to_s.split(".").reduce(@config) do |node, segment|
      # The path runs past a scalar (or a missing section): nothing to find.
      break nil unless node.is_a?(Hash)

      node.fetch(segment) { node[segment.to_sym] }
    end
    # Only a missing value falls back to the default; a stored `false` is kept.
    value.nil? ? default : value
  end

  private

  # Merges all config files into @config (shallow merge of top-level keys).
  # Files are processed in sorted order, so on a key clash the file that
  # sorts last wins. Documents whose top level is not a mapping are ignored.
  def load_all
    Dir.glob(File.join(@config_dir, "*.{yml,yaml,json}")).sort.each do |file|
      data = parse_file(file)
      @config.merge!(data) if data.is_a?(Hash)
    end
  end

  # Parses one file according to its extension.
  def parse_file(file)
    case File.extname(file)
    when ".yml", ".yaml"
      YAML.safe_load_file(file, permitted_classes: [Symbol, Date, Time])
    when ".json"
      JSON.parse(File.read(file))
    end
  end
end

config = ConfigManager.new
config.get("database.host")           # => "localhost"
config.get("database.pool_size", 5)   # => 5

14.10 动手练习

文件搜索工具

# Exercise: implement a grep-like file search tool.
# Example call: search_files("*.rb", /def \w+/, "/path/to/dir")
#
# Judging by the example call, `pattern` is a file glob, `regex` is matched
# against file contents and `dir` is the directory to search; the expected
# return value is not specified here.
def search_files(pattern, regex, dir)
  # Your code here...
end

CSV 转 JSON

# Exercise: convert a CSV file to JSON format.
#
# Presumably `csv_file` is the input path and `json_file` the output path;
# the exact JSON layout is left to the reader.
def csv_to_json(csv_file, json_file)
  # Your code here...
end

# Exercise: report the count and size of each type of file in a directory.
#
# `dir` is the directory to inspect; the shape of the result is left to the
# reader.
def directory_stats(dir)
  # Your code here...
end

14.11 本章小结

要点	说明
File	基本文件操作类
FileUtils	高级文件操作工具
YAML	人类可读的数据格式，适合配置
JSON	网络传输标准格式
CSV	表格数据格式
Marshal	Ruby 内置二进制序列化
Pathname	面向对象的路径处理

📖 扩展阅读

上一章：← 第 13 章：模块深入 下一章：第 15 章：测试驱动开发 →