Ruby 入门指南 / 第 14 章:文件与数据
第 14 章:文件与数据
“数据是新时代的石油,文件操作是炼油的技术。”
14.1 文件读取
14.1.1 基本读取
# 一次性读取整个文件
content = File.read("data.txt")
content = File.read("data.txt", encoding: "UTF-8")
# 读取为行数组
lines = File.readlines("data.txt")
lines = File.readlines("data.txt", chomp: true) # 去除换行符
# 逐行读取(大文件推荐)
File.foreach("data.txt") do |line|
puts line.chomp
end
# 带索引的逐行读取
File.foreach("data.txt").with_index do |line, index|
puts "#{index + 1}: #{line.chomp}"
end
14.1.2 块模式读取
# 使用块确保文件关闭
File.open("data.txt", "r") do |file|
file.each_line do |line|
puts line.chomp
end
end
# 二进制读取
File.open("image.png", "rb") do |file|
data = file.read
puts "Size: #{data.bytesize} bytes"
end
14.1.3 逐块读取
# 逐块读取大文件
File.open("large_file.bin", "rb") do |file|
while (chunk = file.read(4096))
process_chunk(chunk)
end
end
# 带偏移量读取
File.open("data.txt") do |file|
file.seek(10) # 跳到第 10 字节
data = file.read(20) # 读取 20 字节
puts file.pos # 当前位置
end
14.2 文件写入
14.2.1 基本写入
# 覆盖写入
File.write("output.txt", "Hello, World!\n")
# 追加写入
File.open("log.txt", "a") do |file|
file.puts "[#{Time.now}] New entry"
end
# 追加模式简写
File.write("log.txt", "New line\n", mode: "a")
# 写入多行
File.open("data.txt", "w") do |file|
file.puts "Line 1"
file.puts "Line 2"
file.puts "Line 3"
end
14.2.2 临时文件
require "tempfile"
# 创建临时文件
Tempfile.create(["output", ".txt"]) do |file|
file.write("Temporary data")
file.rewind
puts file.read
end
# 带块调用 Tempfile.create 时,块结束后文件会自动关闭并删除
# Tempfile.open 的块模式(只自动关闭,不会立即删除)
Tempfile.open(["output", ".txt"]) do |file|
file.write("Temporary data")
file.rewind
puts file.read
end
# 块结束后文件只会被关闭;要立即删除需调用 file.unlink 或 file.close!,否则要等对象被垃圾回收或程序退出时才清理
# 指定临时目录
Tempfile.create(["output", ".txt"], "/tmp") do |file|
file.write("Data in /tmp")
end
14.3 文件操作
14.3.1 文件信息
# 文件存在性
File.exist?("data.txt") # => true/false
File.file?("data.txt") # => true(是文件)
File.directory?("my_dir") # => true(是目录)
File.symlink?("link") # => true(是符号链接)
# 文件大小和时间
File.size("data.txt") # => 字节数
File.mtime("data.txt") # => 最后修改时间
File.atime("data.txt") # => 最后访问时间
File.ctime("data.txt") # => 状态变更时间(Unix 上是 inode 变更时间,不是创建时间;Windows 上是创建时间)
# 文件权限
File.readable?("data.txt") # => 可读
File.writable?("data.txt") # => 可写
File.executable?("script.sh") # => 可执行
# 文件扩展名和路径
File.extname("data.txt") # => ".txt"
File.basename("dir/data.txt") # => "data.txt"
File.basename("data.txt", ".txt") # => "data"
File.dirname("dir/data.txt") # => "dir"
File.split("dir/data.txt") # => ["dir", "data.txt"]
File.expand_path("../data.txt", __FILE__) # => 绝对路径
14.3.2 目录操作
# 创建目录
Dir.mkdir("new_dir")
Dir.mkdir("new_dir", 0755) # 带权限
# 递归创建
require "fileutils"
FileUtils.mkdir_p("path/to/deep/dir")
# 删除目录
Dir.rmdir("empty_dir")
Dir.delete("empty_dir")
# 列出目录内容
Dir.entries(".") # 所有条目
Dir.children(".") # 不含 . 和 ..
Dir.glob("*") # 通配符匹配
Dir.glob("**/*.rb") # 递归匹配
Dir.glob("*.{rb,py,js}") # 多扩展名
# 遍历目录
Dir.glob("**/*.rb").each do |file|
puts file
end
# 当前目录和切换
Dir.pwd # 当前目录
Dir.chdir("/tmp") # 切换目录
Dir.home # 用户主目录
Dir.tmpdir # 临时目录
14.3.3 文件操作
require "fileutils"
# 复制文件
FileUtils.cp("source.txt", "dest.txt")
FileUtils.cp_r("source_dir", "dest_dir") # 递归复制
# 移动/重命名
FileUtils.mv("old.txt", "new.txt")
# 删除文件
FileUtils.rm("file.txt")
FileUtils.rm_rf("dir") # 递归删除(危险!)
# 创建链接
FileUtils.ln_s("target", "link") # 符号链接
# 比较文件
FileUtils.identical?("file1.txt", "file2.txt") # => true/false
# 更新时间戳(注意:touch 并不是原子写入操作)
FileUtils.touch("file.txt") # 创建或更新时间戳
14.4 YAML 处理
14.4.1 读写 YAML
require "yaml"
# 读取 YAML
config = YAML.load_file("config.yml")
config = YAML.safe_load_file("config.yml", permitted_classes: [Symbol, Date, Time])
# 解析 YAML 字符串
data = YAML.parse("---\nname: Alice\nage: 25\n")
hash = YAML.load("---\nname: Alice\nage: 25\n")
# 写入 YAML
data = { name: "Alice", age: 25, hobbies: ["reading", "coding"] }
File.write("data.yml", data.to_yaml)
# YAML.dump
File.open("data.yml", "w") do |file|
YAML.dump(data, file)
end
14.4.2 YAML 配置文件
# config.yml
# database:
# adapter: postgresql
# host: localhost
# port: 5432
# name: myapp_development
#
# server:
# host: 0.0.0.0
# port: 3000
#
# logging:
# level: info
# file: log/app.log
# 加载配置
# Read-only wrapper around a YAML configuration file.
#
# Values can be looked up with a dotted key path (get("database.host")) or
# with chained method calls (config.database.host).
class Config
  # @param path [String] YAML file to load; a missing file yields an empty config
  def initialize(path = "config.yml")
    loaded = YAML.safe_load_file(path, permitted_classes: [Symbol]) if File.exist?(path)
    # An empty or list-only document does not parse to a Hash; such a
    # document cannot be addressed by key, so fall back to an empty config.
    @data = loaded.is_a?(Hash) ? loaded : {}
  end

  # Looks up a value by dotted key path.
  #
  # @param key [String, Symbol] path such as "database.host"
  # @param default [Object] returned when the path is absent or its value is nil
  # @return [Object] the stored value (a stored +false+ is returned as-is)
  def get(key, default = nil)
    value = key.to_s.split(".").reduce(@data) do |node, part|
      # Stop as soon as the path runs through a scalar or a missing key.
      break unless node.is_a?(Hash)

      lookup(node, part)
    end
    value.nil? ? default : value
  end

  # Exposes top-level keys as methods. Nested hashes are wrapped in another
  # Config so that calls can be chained (config.database.host).
  def method_missing(name, *args)
    return super unless key?(name)

    value = lookup(@data, name.to_s)
    value.is_a?(Hash) ? self.class.send(:wrap, value) : value
  end

  # Keeps respond_to? consistent with method_missing.
  def respond_to_missing?(name, include_private = false)
    key?(name) || super
  end

  # Builds a Config around an already-parsed hash without touching the disk.
  def self.wrap(hash)
    allocate.tap { |config| config.instance_variable_set(:@data, hash) }
  end
  private_class_method :wrap

  private

  # True when +name+ exists as a top-level key, stored as String or Symbol.
  def key?(name)
    @data.key?(name.to_s) || @data.key?(name.to_sym)
  end

  # Fetches +key+ from +node+, trying the String form first, then the Symbol form.
  def lookup(node, key)
    node.fetch(key) { node[key.to_sym] }
  end
end
config = Config.new
config.database.host # => "localhost"
config.server.port # => 3000
14.5 JSON 处理
14.5.1 基本操作
require "json"
# 解析 JSON
json_string = '{"name": "Alice", "age": 25, "hobbies": ["reading", "coding"]}'
data = JSON.parse(json_string)
data["name"] # => "Alice"
data["hobbies"] # => ["reading", "coding"]
# 从文件读取
data = JSON.parse(File.read("data.json"))
data = JSON.load_file("data.json") # Ruby 3.0+(json 2.4 及以上)
# 生成 JSON
hash = { name: "Alice", age: 25, active: true }
json = hash.to_json
# => '{"name":"Alice","age":25,"active":true}'
# 格式化输出
json = JSON.pretty_generate(hash)
# {
# "name": "Alice",
# "age": 25,
# "active": true
# }
# 写入文件
File.write("output.json", JSON.pretty_generate(data))
14.5.2 JSON 处理技巧
require "json"
# 嵌套数据
users = [
{
id: 1,
name: "Alice",
profile: {
email: "[email protected]",
age: 25
}
},
{
id: 2,
name: "Bob",
profile: {
email: "[email protected]",
age: 30
}
}
]
# 转为 JSON
json = users.to_json
# 解析并处理
parsed = JSON.parse(json, symbolize_names: true)
parsed.each do |user|
puts "#{user[:name]}: #{user[:profile][:email]}"
end
# 使用 JSON::Ext 加速(默认已包含)
# 可选:使用 Oj gem 获取更好性能
# require "oj"
# Oj.load(json_string)
# Oj.dump(ruby_object)
14.6 CSV 处理
14.6.1 读取 CSV
require "csv"
# 读取为数组
CSV.foreach("data.csv") do |row|
puts row.inspect # => ["name", "age", "city"]
end
# 读取为哈希(带表头)
CSV.foreach("data.csv", headers: true) do |row|
puts "#{row['name']}: #{row['age']}"
end
# 一次性读取
table = CSV.read("data.csv", headers: true)
table.each do |row|
puts row["name"]
end
# 转为数组
rows = CSV.read("data.csv") # => [["name", "age"], ["Alice", "25"], ...]
# 读取为 CSV::Table(表头转为符号,数值字段自动转换);需要哈希数组时再调用 map(&:to_h)
data = CSV.table("data.csv") # => #<CSV::Table>,data.map(&:to_h) => [{name: "Alice", age: 25}, ...]
14.6.2 写入 CSV
require "csv"
# 写入为数组
CSV.open("output.csv", "w") do |csv|
csv << ["name", "age", "city"]
csv << ["Alice", 25, "Beijing"]
csv << ["Bob", 30, "Shanghai"]
end
# 字符串输出
csv_string = CSV.generate do |csv|
csv << ["name", "age"]
csv << ["Alice", 25]
csv << ["Bob", 30]
end
puts csv_string
# 使用 headers 选项写入
CSV.open("output.csv", "w", headers: %w[name age city], write_headers: true) do |csv|
csv << ["Alice", 25, "Beijing"]
csv << ["Bob", 30, "Shanghai"]
end
14.6.3 CSV 数据处理
require "csv"
# 读取并处理
users = CSV.read("users.csv", headers: true)
# 过滤
adults = users.select { |row| row["age"].to_i >= 18 }
# 转换
names = users.map { |row| row["name"] }
# 分组
by_city = users.group_by { |row| row["city"] }
# 聚合
avg_age = users.sum { |row| row["age"].to_f } / users.length
# 写出处理结果
CSV.open("adults.csv", "w", headers: users.headers, write_headers: true) do |csv|
adults.each { |row| csv << row }
end
# 自定义列分隔符
CSV.foreach("data.tsv", col_sep: "\t") do |row|
puts row.inspect
end
14.7 序列化
14.7.1 Marshal 序列化
# Marshal is Ruby's built-in binary serialization format.
data = { name: "Alice", scores: [95, 87, 92], active: true }
# Serialize to a binary string. binwrite keeps the bytes untouched; a
# text-mode write may translate newlines on some platforms and corrupt the dump.
binary = Marshal.dump(data)
File.binwrite("data.marshal", binary)
# Deserialize, reading the bytes back in binary mode as well.
# NOTE: Marshal.load can instantiate arbitrary objects; only load data you trust.
loaded = Marshal.load(File.binread("data.marshal"))
loaded[:name] # => "Alice"
# 深拷贝(使用 Marshal)
original = { a: [1, 2, 3], b: { c: 4 } }
copy = Marshal.load(Marshal.dump(original))
copy[:a] << 4
original[:a] # => [1, 2, 3](不受影响)
14.7.2 自定义序列化
require "json"
require "time" # provides Time.parse, which is not available by default

# A user record that can be round-tripped through Marshal and JSON.
class User
  attr_reader :name, :email, :created_at

  # @param name [String]
  # @param email [String]
  def initialize(name, email)
    @name = name
    @email = email
    @created_at = Time.now
  end

  # Custom Marshal serialization. The Time is stored as-is so that
  # sub-second precision survives the round trip.
  def marshal_dump
    { name: @name, email: @email, created_at: @created_at }
  end

  # Restores state from the hash produced by marshal_dump. A String
  # timestamp (the previous dump format) is still accepted.
  def marshal_load(data)
    @name = data[:name]
    @email = data[:email]
    restore_created_at(data[:created_at])
  end

  # JSON serialization; created_at is emitted via Time#to_s (second precision).
  def to_json(*args)
    { name: @name, email: @email, created_at: @created_at.to_s }.to_json(*args)
  end

  # Rebuilds a User from the output of to_json, including its created_at.
  # When the payload has no created_at, the construction time is kept.
  #
  # @param json_string [String]
  # @return [User]
  def self.from_json(json_string)
    data = JSON.parse(json_string, symbolize_names: true)
    new(data[:name], data[:email]).tap do |user|
      user.send(:restore_created_at, data[:created_at])
    end
  end

  private

  # Assigns @created_at from a Time or a parseable String; nil leaves it untouched.
  def restore_created_at(stamp)
    case stamp
    when Time then @created_at = stamp
    when String then @created_at = Time.parse(stamp)
    end
  end
end
user = User.new("Alice", "[email protected]")
# Marshal
binary = Marshal.dump(user)
restored = Marshal.load(binary)
# JSON
json = user.to_json
restored = User.from_json(json)
14.8 路径处理
14.8.1 Pathname
require "pathname"
# 创建 Pathname
path = Pathname.new("/home/user/data.txt")
path = Pathname("/home/user/data.txt")
# 路径信息
path.basename # => #<Pathname:data.txt>
path.basename(".txt") # => #<Pathname:data>
path.dirname # => #<Pathname:/home/user>
path.extname # => ".txt"
path.to_s # => "/home/user/data.txt"
# 路径操作
path.parent # => #<Pathname:/home/user>
path + "other.txt" # => #<Pathname:/home/user/other.txt>
path.expand_path # => 绝对路径
# 文件操作
path.exist? # => true/false
path.file? # => true
path.directory? # => false
path.read # => 文件内容
path.write("data") # 写入
path.size # => 字节数
# 目录遍历
Pathname(".").children # 当前目录子项
Pathname(".").glob("**/*.rb") # 递归查找
# 链式操作
Pathname("output")
.join("data", "results")
.expand_path
.parent
14.9 实际业务场景
14.9.1 日志处理
require "csv"
require "json"
require "time" # provides Time.parse, which is not available by default

# Parses log lines of the form "[YYYY-MM-DD HH:MM:SS] LEVEL message" and
# offers CSV export and simple statistics over them.
class LogParser
  # Captures: 1 = timestamp, 2 = level, 3 = message.
  LINE_PATTERN = /\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+) (.+)/

  # @param log_file [String] path of the log file to read
  def initialize(log_file)
    @log_file = log_file
  end

  # Reads the log file line by line; lines that do not match the expected
  # format are skipped.
  #
  # @return [Array<Hash>] entries with :time (Time), :level and :message
  # @raise [ArgumentError] from Time.parse when a matched timestamp is not a valid time
  def parse
    File.foreach(@log_file).filter_map do |line|
      # chomp first so a CRLF line ending does not leak "\r" into the message
      match = LINE_PATTERN.match(line.chomp)
      next unless match

      { time: Time.parse(match[1]), level: match[2], message: match[3] }
    end
  end

  # Writes all parsed entries to +output_file+ as CSV with a header row.
  def to_csv(output_file)
    CSV.open(output_file, "w", write_headers: true,
                               headers: %w[time level message]) do |csv|
      parse.each { |entry| csv << entry.values_at(:time, :level, :message) }
    end
  end

  # @return [Hash] :total entry count, :by_level counts per level,
  #   :first / :last timestamps (nil when the log has no matching lines)
  def statistics
    entries = parse
    {
      total: entries.length,
      by_level: entries.map { |entry| entry[:level] }.tally,
      first: entries.first&.dig(:time),
      last: entries.last&.dig(:time)
    }
  end
end
14.9.2 配置管理器
require "yaml"
require "json"
require "date" # the Date constant is referenced below and is not loaded by yaml/json

# Loads every YAML and JSON file found directly inside a directory and merges
# their top-level keys into a single configuration hash.
class ConfigManager
  # @param config_dir [String] directory scanned for *.yml, *.yaml and *.json files
  def initialize(config_dir = "config")
    @config_dir = config_dir
    @config = {}
    load_all
  end

  # Looks up a value by dotted key path, accepting String or Symbol keys.
  #
  # @param key_path [String, Symbol] path such as "database.host"
  # @param default [Object] returned when the path is absent or its value is nil
  # @return [Object] the stored value (a stored +false+ is returned as-is)
  def get(key_path, default = nil)
    value = key_path.to_s.split(".").reduce(@config) do |node, key|
      # Stop as soon as the path runs through a scalar or a missing key.
      break unless node.is_a?(Hash)

      node.fetch(key) { node[key.to_sym] }
    end
    value.nil? ? default : value
  end

  private

  # Merges all config files in name order, so that a later file overriding
  # an earlier file's top-level key is deterministic.
  def load_all
    pattern = File.join(@config_dir, "*.{yml,yaml,json}")
    Dir.glob(pattern).sort.each do |file|
      data = parse_file(file)
      # Empty files and list/scalar documents have no keys to merge.
      @config.merge!(data) if data.is_a?(Hash)
    end
  end

  # Parses one file according to its extension.
  def parse_file(file)
    case File.extname(file)
    when ".yml", ".yaml"
      YAML.safe_load_file(file, permitted_classes: [Symbol, Date, Time])
    when ".json"
      JSON.parse(File.read(file))
    end
  end
end
config = ConfigManager.new
config.get("database.host") # => "localhost"
config.get("database.pool_size", 5) # => 5
14.10 动手练习
- 文件搜索工具
# 实现 grep 类似的文件搜索工具
# search_files("*.rb", /def \w+/, "/path/to/dir")
def search_files(pattern, regex, dir)
# 你的代码...
end
- CSV 转 JSON
# 将 CSV 文件转换为 JSON 格式
def csv_to_json(csv_file, json_file)
# 你的代码...
end
- 目录统计
# 统计目录中各类型文件的数量和大小
def directory_stats(dir)
# 你的代码...
end
14.11 本章小结
| 要点 | 说明 |
|---|---|
| File | 基本文件操作类 |
| FileUtils | 高级文件操作工具 |
| YAML | 人类可读的数据格式,适合配置 |
| JSON | 网络传输标准格式 |
| CSV | 表格数据格式 |
| Marshal | Ruby 内置二进制序列化 |
| Pathname | 面向对象的路径处理 |
📖 扩展阅读
上一章:← 第 13 章:模块深入 下一章:第 15 章:测试驱动开发 →