Skip to content

Ruby XML, XSLT 和 XPath 教程

XML(可扩展标记语言)是一种广泛使用的数据交换格式,在Web服务、配置文件和数据存储中都有重要应用。Ruby提供了多种处理XML的方式,包括内置的REXML库和第三方库如Nokogiri。本章将详细介绍如何在Ruby中解析、生成和操作XML数据,以及使用XPath和XSLT进行高级处理。

🎯 XML基础

什么是XML

XML(eXtensible Markup Language)是一种标记语言,设计用来存储和传输数据。它具有以下特点:

  • 自描述性:标签名称可以描述数据的含义
  • 层次结构:支持嵌套的数据结构
  • 平台无关:可在不同系统间交换数据
  • 可扩展:可以定义自己的标签

XML基本结构

xml
<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
  <book id="1">
    <title>Ruby编程入门</title>
    <author>张三</author>
    <price currency="CNY">59.00</price>
    <category>编程</category>
  </book>
  <book id="2">
    <title>Web开发实战</title>
    <author>李四</author>
    <price currency="CNY">79.00</price>
    <category>Web</category>
  </book>
</bookstore>

📖 使用REXML处理XML

解析XML文档

ruby
require 'rexml/document'

# Sample bookstore catalogue used by the walkthrough below.
xml_data = <<~XML
  <?xml version="1.0" encoding="UTF-8"?>
  <bookstore>
    <book id="1">
      <title>Ruby编程入门</title>
      <author>张三</author>
      <price currency="CNY">59.00</price>
      <category>编程</category>
    </book>
    <book id="2">
      <title>Web开发实战</title>
      <author>李四</author>
      <price currency="CNY">79.00</price>
      <category>Web</category>
    </book>
  </bookstore>
XML

# Build the in-memory tree from the XML string.
doc = REXML::Document.new(xml_data)

# The root element of this document is <bookstore>.
root = doc.root
puts "根元素: #{root.name}"

# Visit every <book> child of the root and print its fields.
root.get_elements('book').each do |entry|
  price_node = entry.elements['price']

  puts "书籍ID: #{entry.attributes['id']}"
  puts "书名: #{entry.text('title')}"
  puts "作者: #{entry.text('author')}"
  puts "价格: #{price_node.text} #{price_node.attributes['currency']}"
  puts "---"
end

生成XML文档

ruby
require 'rexml/document'
require 'stringio' # StringIO is used below; do not rely on another library loading it

# Start from an empty document and give it an XML declaration.
doc = REXML::Document.new
doc << REXML::XMLDecl.new('1.0', 'UTF-8')

# Root element that will hold every <book>.
bookstore = doc.add_element('bookstore')

# Book data kept separate from the element-building code so that adding a
# book does not mean copy-pasting five lines.
books = [
  { id: '1', title: 'Ruby编程入门', author: '张三', price: '59.00', category: '编程' },
  { id: '2', title: 'Web开发实战', author: '李四', price: '79.00', category: 'Web' }
]

# Child order (title, author, price, category) matches the sample document.
books.each do |info|
  book = bookstore.add_element('book', { 'id' => info[:id] })
  book.add_element('title').text = info[:title]
  book.add_element('author').text = info[:author]
  book.add_element('price', { 'currency' => 'CNY' }).text = info[:price]
  book.add_element('category').text = info[:category]
end

# Serialize into a buffer; the second argument is the indent width in spaces.
output = StringIO.new
doc.write(output, 2)
puts output.string

修改XML文档

ruby
require 'rexml/document'
require 'stringio' # StringIO is used below; do not rely on another library loading it

# Existing document that the steps below edit in place.
xml_data = <<~XML
  <bookstore>
    <book id="1">
      <title>旧书名</title>
      <author>张三</author>
      <price currency="CNY">59.00</price>
    </book>
  </bookstore>
XML

doc = REXML::Document.new(xml_data)

# Replace the text of an element.
book = doc.root.elements['book']
book.elements['title'].text = '新书名'

# Overwrite an attribute value.
book.elements['price'].attributes['currency'] = 'USD'

# Append a new child element.
book.add_element('category').text = '编程'

# Removing an element (left disabled in this walkthrough):
# book.delete_element('author')

# Serialize the edited tree with a 2-space indent and print it.
output = StringIO.new
doc.write(output, 2)
puts output.string

🔍 使用XPath查询XML

XPath基础

XPath是一种在XML文档中查找节点的语言。REXML支持XPath查询:

ruby
require 'rexml/document'
require 'rexml/xpath'

# Small library catalogue: two fiction titles and one technical title.
xml_data = <<~XML
  <library>
    <book category="fiction" id="1">
      <title lang="zh">小说A</title>
      <author>作者A</author>
      <year>2020</year>
      <price>29.99</price>
    </book>
    <book category="fiction" id="2">
      <title lang="en">Novel B</title>
      <author>Author B</author>
      <year>2021</year>
      <price>39.99</price>
    </book>
    <book category="technical" id="3">
      <title lang="zh">技术手册</title>
      <author>技术作者</author>
      <year>2019</year>
      <price>49.99</price>
    </book>
  </library>
XML

doc = REXML::Document.new(xml_data)
xpath = REXML::XPath

# Run every query first, then print the results in the original order.

# All <book> elements anywhere in the document.
books = xpath.match(doc, '//book')
# Books whose category attribute is "fiction".
fiction_books = xpath.match(doc, '//book[@category="fiction"]')
# The single book with id="1".
book1 = xpath.first(doc, '//book[@id="1"]')
# Books whose <title> carries lang="zh" (an attribute test on a child).
chinese_books = xpath.match(doc, '//book[title/@lang="zh"]')
# Axis query: <book> siblings that come after the book with id="1".
following_books = xpath.match(doc, '//book[@id="1"]/following-sibling::book')
# Axis query: the parent element of a <book>.
book_parent = xpath.first(doc, '//book/parent::*')

puts "书籍总数: #{books.size}"
puts "小说类书籍: #{fiction_books.size}本"
puts "书籍1标题: #{book1.text('title')}"
puts "中文书籍: #{chinese_books.size}本"
puts "书籍1之后的书籍: #{following_books.size}本"
puts "书籍的父元素: #{book_parent.name}"

高级XPath查询

ruby
require 'rexml/document'
require 'rexml/xpath'

# Company fixture: departments contain employees, employees contain skills.
xml_data = <<~XML
  <company>
    <department name="开发部">
      <employee id="001">
        <name>张三</name>
        <position>高级工程师</position>
        <salary>15000</salary>
        <skills>
          <skill>Ruby</skill>
          <skill>JavaScript</skill>
          <skill>Python</skill>
        </skills>
      </employee>
      <employee id="002">
        <name>李四</name>
        <position>初级工程师</position>
        <salary>8000</salary>
        <skills>
          <skill>Java</skill>
          <skill>SQL</skill>
        </skills>
      </employee>
    </department>
    <department name="设计部">
      <employee id="003">
        <name>王五</name>
        <position>UI设计师</position>
        <salary>12000</salary>
        <skills>
          <skill>Photoshop</skill>
          <skill>Sketch</skill>
        </skills>
      </employee>
    </department>
  </company>
XML

doc = REXML::Document.new(xml_data)

# Shorthand for "all nodes matching +path+ relative to +context+".
find_all = ->(context, path) { REXML::XPath.match(context, path) }

# Numeric predicate: employees whose <salary> is greater than 10000.
high_salary_employees = find_all.call(doc, '//employee[salary > 10000]')
puts "高薪员工:"
high_salary_employees.each do |emp|
  puts "  #{emp.text('name')}: #{emp.text('salary')}"
end

# Employees that list "Ruby" among their <skill> entries.
ruby_developers = find_all.call(doc, '//employee[skills/skill="Ruby"]')
puts "\nRuby开发者:"
ruby_developers.each { |emp| puts "  #{emp.text('name')}" }

# Head count per department; the inner query is relative to each <department>.
departments = find_all.call(doc, '//department')
puts "\n部门员工统计:"
departments.each do |dept|
  headcount = find_all.call(dept, './/employee').size
  puts "  #{dept.attributes['name']}: #{headcount}人"
end

# Distinct skill names in document order.
unique_skills = find_all.call(doc, '//skill').map(&:text).uniq
puts "\n所有技能: #{unique_skills.join(', ')}"

🛠️ 使用Nokogiri处理XML

安装和基本使用

Nokogiri是一个功能更强大的XML/HTML处理库,需要先安装:

bash
gem install nokogiri
ruby
require 'nokogiri'

# 解析XML
xml_data = <<~XML
  <?xml version="1.0" encoding="UTF-8"?>
  <products>
    <product id="1">
      <name>笔记本电脑</name>
      <price currency="CNY">5999.00</price>
      <description>高性能笔记本电脑</description>
      <tags>
        <tag>电子</tag>
        <tag>电脑</tag>
        <tag>办公</tag>
      </tags>
    </product>
    <product id="2">
      <name>智能手机</name>
      <price currency="CNY">3999.00</price>
      <description>最新款智能手机</description>
      <tags>
        <tag>电子</tag>
        <tag>手机</tag>
        <tag>通讯</tag>
      </tags>
    </product>
  </products>
XML

# Build a Nokogiri document from the XML string.
doc = Nokogiri::XML(xml_data)

# CSS selector: count every <product>.
puts "产品总数: #{doc.css('product').size}"

# Attribute selector: the product whose id is "1".
product1 = doc.at_css('product[id="1"]')
puts "产品1名称: #{product1.css('name').text}"

# XPath with a numeric predicate on the <price> child.
expensive_products = doc.xpath('//product[price > 4000]')
puts "昂贵产品数: #{expensive_products.size}"

# Print one record per product, including its tag list.
doc.css('product').each do |product|
  price_nodes = product.css('price')
  tag_names = product.css('tag').map(&:text).join(', ')

  puts "产品ID: #{product['id']}"
  puts "名称: #{product.css('name').text}"
  puts "价格: #{price_nodes.text} #{price_nodes.first['currency']}"
  puts "标签: #{tag_names}"
  puts "---"
end

修改XML文档

ruby
require 'nokogiri'

# 解析XML
xml_data = <<~XML
  <inventory>
    <item sku="A001">
      <name>商品A</name>
      <quantity>100</quantity>
      <price>29.99</price>
    </item>
  </inventory>
XML

# Parse with :noblanks so whitespace-only text nodes are dropped. When they
# are kept, the serializer preserves the original layout and `indent:` cannot
# re-indent the nodes appended below.
doc = Nokogiri::XML(xml_data, &:noblanks)

# Change element text.
item = doc.at_css('item')
item.at_css('name').content = '新商品A'
item.at_css('quantity').content = '150'

# Change an attribute.
item['sku'] = 'A001-NEW'

# Append a new element built as a node object ...
description = Nokogiri::XML::Node.new('description', doc)
description.content = '这是商品A的描述'
item.add_child(description)

# ... or from a markup string.
item.add_child('<category>电子</category>')

# Removing elements (left disabled in this walkthrough):
# item.css('description').remove

# Print the edited document.
puts doc.to_xml(indent: 2)

处理HTML

ruby
require 'nokogiri'

# 解析HTML
html_data = <<~HTML
  <!DOCTYPE html>
  <html>
    <head>
      <title>商品页面</title>
    </head>
    <body>
      <div class="products">
        <div class="product" data-id="1">
          <h2>笔记本电脑</h2>
          <p class="price">¥5999.00</p>
          <p class="description">高性能笔记本电脑</p>
          <button class="add-to-cart">加入购物车</button>
        </div>
        <div class="product" data-id="2">
          <h2>智能手机</h2>
          <p class="price">¥3999.00</p>
          <p class="description">最新款智能手机</p>
          <button class="add-to-cart">加入购物车</button>
        </div>
      </div>
    </body>
  </html>
HTML

doc = Nokogiri::HTML(html_data)

# The product nodes are queried once and reused by the read-only steps below.
product_nodes = doc.css('.product')
puts "商品总数: #{product_nodes.size}"

# First product's heading.
first_product = product_nodes.first
puts "第一个商品标题: #{first_product.css('h2').text}"

# data-* attributes are read like any other attribute.
product_ids = product_nodes.map { |node| node['data-id'] }
puts "商品ID: #{product_ids}"

# Text of every element with class "price".
prices = doc.css('.price').map(&:text)
puts "价格列表: #{prices}"

# Edit existing content.
doc.at_css('title').content = '新商品页面'
first_product.at_css('h2').content = 'MacBook Pro'

# Build a third product card and append it to the product list.
new_product = Nokogiri::XML::Node.new('div', doc)
new_product['class'] = 'product'
new_product['data-id'] = '3'
new_product.inner_html = <<~HTML
  <h2>平板电脑</h2>
  <p class="price">¥2999.00</p>
  <p class="description">轻薄便携平板电脑</p>
  <button class="add-to-cart">加入购物车</button>
HTML

doc.at_css('.products').add_child(new_product)

puts doc.to_html

🎨 XSLT转换

XSLT基础

XSLT(可扩展样式表语言转换)用于将XML文档转换为其他格式:

ruby
require 'nokogiri'

# XML数据
xml_data = <<~XML
  <?xml version="1.0" encoding="UTF-8"?>
  <catalog>
    <product id="1">
      <name>笔记本电脑</name>
      <price currency="CNY">5999.00</price>
      <category>电脑</category>
    </product>
    <product id="2">
      <name>智能手机</name>
      <price currency="CNY">3999.00</price>
      <category>手机</category>
    </product>
  </catalog>
XML

# XSLT stylesheet that renders the catalogue as an HTML table.
# The price and currency are separated with <xsl:text> </xsl:text>: a bare
# space between two instructions is a whitespace-only text node, which XSLT
# strips from the stylesheet, so the two values would otherwise run together.
xslt_data = <<~XSLT
  <?xml version="1.0" encoding="UTF-8"?>
  <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="html" indent="yes"/>

    <xsl:template match="/">
      <html>
        <head>
          <title>产品目录</title>
          <style>
            table { border-collapse: collapse; width: 100%; }
            th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
            th { background-color: #f2f2f2; }
          </style>
        </head>
        <body>
          <h1>产品目录</h1>
          <table>
            <tr>
              <th>ID</th>
              <th>名称</th>
              <th>价格</th>
              <th>类别</th>
            </tr>
            <xsl:for-each select="catalog/product">
              <tr>
                <td><xsl:value-of select="@id"/></td>
                <td><xsl:value-of select="name"/></td>
                <td><xsl:value-of select="price"/><xsl:text> </xsl:text><xsl:value-of select="price/@currency"/></td>
                <td><xsl:value-of select="category"/></td>
              </tr>
            </xsl:for-each>
          </table>
        </body>
      </html>
    </xsl:template>
  </xsl:stylesheet>
XSLT

# Parse the source document and compile the stylesheet.
xml_doc = Nokogiri::XML(xml_data)
xslt_doc = Nokogiri::XSLT(xslt_data)

# Apply the stylesheet and print the resulting document as HTML.
puts xslt_doc.transform(xml_doc).to_html

复杂XSLT转换

ruby
require 'nokogiri'

# 复杂的XML数据
xml_data = <<~XML
  <?xml version="1.0" encoding="UTF-8"?>
  <library>
    <book isbn="978-0123456789">
      <title>编程语言设计</title>
      <authors>
        <author>
          <name>张三</name>
          <role>主编</role>
        </author>
        <author>
          <name>李四</name>
          <role>副主编</role>
        </author>
      </authors>
      <publisher>科技出版社</publisher>
      <publication_date>2023-01-15</publication_date>
      <pages>450</pages>
      <price currency="CNY">89.00</price>
      <tags>
        <tag>编程</tag>
        <tag>语言</tag>
        <tag>设计</tag>
      </tags>
    </book>
  </library>
XML

# XSLT stylesheet that renders each book as a plain-text report.
# Line breaks in the output are written as the character reference &#10;
# instead of literal newlines inside <xsl:text>. Literal newlines forced some
# heredoc lines to start at column 0, which prevents `<<~` from stripping any
# indentation and left whitespace in front of the XML declaration.
xslt_data = <<~XSLT
  <?xml version="1.0" encoding="UTF-8"?>
  <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" indent="no"/>

    <xsl:template match="/">
      <xsl:for-each select="library/book">
        <xsl:text>=== 图书信息 ===&#10;</xsl:text>
        <xsl:text>书名: </xsl:text>
        <xsl:value-of select="title"/>
        <xsl:text>&#10;ISBN: </xsl:text>
        <xsl:value-of select="@isbn"/>
        <xsl:text>&#10;作者: </xsl:text>
        <xsl:for-each select="authors/author">
          <xsl:value-of select="name"/>
          <xsl:if test="role">
            <xsl:text> (</xsl:text>
            <xsl:value-of select="role"/>
            <xsl:text>)</xsl:text>
          </xsl:if>
          <xsl:if test="position() != last()">
            <xsl:text>, </xsl:text>
          </xsl:if>
        </xsl:for-each>
        <xsl:text>&#10;出版社: </xsl:text>
        <xsl:value-of select="publisher"/>
        <xsl:text>&#10;出版日期: </xsl:text>
        <xsl:value-of select="publication_date"/>
        <xsl:text>&#10;页数: </xsl:text>
        <xsl:value-of select="pages"/>
        <xsl:text>&#10;价格: </xsl:text>
        <xsl:value-of select="price"/>
        <xsl:text> </xsl:text>
        <xsl:value-of select="price/@currency"/>
        <xsl:text>&#10;标签: </xsl:text>
        <xsl:for-each select="tags/tag">
          <xsl:value-of select="."/>
          <xsl:if test="position() != last()">
            <xsl:text>, </xsl:text>
          </xsl:if>
        </xsl:for-each>
        <xsl:text>&#10;&#10;</xsl:text>
      </xsl:for-each>
    </xsl:template>
  </xsl:stylesheet>
XSLT

# Parse the input, compile the stylesheet, run it, and print the text result.
xml_doc = Nokogiri::XML(xml_data)
xslt_doc = Nokogiri::XSLT(xslt_data)
puts xslt_doc.transform(xml_doc).text

🎯 XML实用示例

配置文件处理

ruby
require 'nokogiri'

# 应用配置XML
config_xml = <<~XML
  <?xml version="1.0" encoding="UTF-8"?>
  <configuration>
    <database>
      <host>localhost</host>
      <port>5432</port>
      <name>myapp_db</name>
      <username>admin</username>
      <password>secret</password>
    </database>
    <logging>
      <level>INFO</level>
      <file>/var/log/myapp.log</file>
      <max_size>10MB</max_size>
    </logging>
    <features>
      <feature name="user_management" enabled="true"/>
      <feature name="reporting" enabled="false"/>
      <feature name="analytics" enabled="true"/>
    </features>
  </configuration>
XML

# Read/write access to an application configuration stored as XML.
# The document is expected to contain <database>, <logging> and <features>
# sections shaped like the sample configuration in this chapter.
class ConfigManager
  # @param config_xml [String] the configuration document
  def initialize(config_xml)
    @doc = Nokogiri::XML(config_xml)
  end

  # @return [Hash] :host, :port (Integer), :name, :username, :password
  def database_config
    db = @doc.css('database').first
    {
      host: db.css('host').text,
      port: db.css('port').text.to_i,
      name: db.css('name').text,
      username: db.css('username').text,
      password: db.css('password').text
    }
  end

  # @return [Hash] :level, :file, :max_size (all Strings)
  def logging_config
    log = @doc.css('logging').first
    {
      level: log.css('level').text,
      file: log.css('file').text,
      max_size: log.css('max_size').text
    }
  end

  # Whether the named <feature> exists and has enabled="true".
  #
  # The name is passed to XPath as a bound variable rather than being
  # interpolated into the expression, so quotes or XPath syntax in the name
  # cannot change the query.
  #
  # @param feature_name [String, Symbol]
  # @return [Boolean] always true or false, never nil
  def feature_enabled?(feature_name)
    feature = @doc.at_xpath('//feature[@name=$name]', nil, { name: feature_name.to_s })
    !feature.nil? && feature['enabled'] == 'true'
  end

  # Replace the text of <database><host>.
  #
  # @param new_host [String]
  def update_database_host(new_host)
    @doc.css('database host').first.content = new_host
  end

  # @return [String] the current document serialized as XML
  def to_xml
    @doc.to_xml(indent: 2)
  end
end

# Demonstrates ConfigManager against the sample configuration.
config = ConfigManager.new(config_xml)

# Database section: label/key pairs drive the printout.
puts "数据库配置:"
db_config = config.database_config
[['主机', :host], ['端口', :port], ['数据库名', :name]].each do |label, key|
  puts "  #{label}: #{db_config[key]}"
end

# Logging section.
puts "\n日志配置:"
log_config = config.logging_config
[['级别', :level], ['文件', :file]].each do |label, key|
  puts "  #{label}: #{log_config[key]}"
end

# Feature flags.
puts "\n功能状态:"
[
  ['用户管理', 'user_management'],
  ['报告功能', 'reporting'],
  ['分析功能', 'analytics']
].each do |label, feature|
  puts "  #{label}: #{config.feature_enabled?(feature)}"
end

# Change the database host and print the resulting document.
config.update_database_host('newhost.example.com')
puts "\n更新后的配置:"
puts config.to_xml

RSS订阅解析

ruby
require 'nokogiri'

# 模拟RSS数据
rss_data = <<~RSS
  <?xml version="1.0" encoding="UTF-8"?>
  <rss version="2.0">
    <channel>
      <title>技术博客</title>
      <link>https://example.com</link>
      <description>最新的技术文章</description>
      <language>zh-CN</language>
      <item>
        <title>Ruby 3.0新特性介绍</title>
        <link>https://example.com/ruby-3-0</link>
        <description>Ruby 3.0带来了许多令人兴奋的新特性...</description>
        <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
        <guid>https://example.com/ruby-3-0</guid>
      </item>
      <item>
        <title>Web开发最佳实践</title>
        <link>https://example.com/web-best-practices</link>
        <description>在现代Web开发中,遵循最佳实践至关重要...</description>
        <pubDate>Sun, 31 Dec 2023 10:00:00 GMT</pubDate>
        <guid>https://example.com/web-best-practices</guid>
      </item>
    </channel>
  </rss>
RSS

require 'time' # Time.parse is defined by the stdlib "time" extension, not core Time

# Minimal RSS 2.0 reader: channel metadata plus a list of items.
class RSSParser
  # @param rss_xml [String] the feed document
  def initialize(rss_xml)
    @doc = Nokogiri::XML(rss_xml)
  end

  # Channel-level metadata.
  #
  # Only direct children of <channel> are read. A descendant search for
  # "title", "link" or "description" would also match the same-named
  # elements inside every <item> and concatenate all of their text.
  #
  # @return [Hash] :title, :link, :description, :language
  def channel_info
    channel = @doc.css('channel').first
    {
      title: child_text(channel, 'title'),
      link: child_text(channel, 'link'),
      description: child_text(channel, 'description'),
      language: child_text(channel, 'language')
    }
  end

  # @return [Array<Hash>] one hash per <item>, in document order, with
  #   :title, :link, :description, :pub_date (Time) and :guid
  def items
    @doc.css('item').map do |item|
      {
        title: child_text(item, 'title'),
        link: child_text(item, 'link'),
        description: child_text(item, 'description'),
        pub_date: Time.parse(child_text(item, 'pubDate')),
        guid: child_text(item, 'guid')
      }
    end
  end

  # The +count+ most recently published items, newest first. Sorting by
  # date means the result does not depend on the order of items in the feed.
  #
  # @param count [Integer]
  # @return [Array<Hash>]
  def latest_items(count = 5)
    items_by_date.first(count)
  end

  # @return [Array<Hash>] all items, newest first
  def items_by_date
    items.sort { |a, b| b[:pub_date] <=> a[:pub_date] }
  end

  private

  # Text of the direct child elements of +node+ called +name+ ('' if none).
  def child_text(node, name)
    node.xpath("./#{name}").text
  end
end

# Demonstrates RSSParser against the sample feed.
rss = RSSParser.new(rss_data)

# Channel header.
puts "频道信息:"
channel = rss.channel_info
[['标题', :title], ['链接', :link], ['描述', :description]].each do |label, key|
  puts "  #{label}: #{channel[key]}"
end

# Two newest entries; the summary is cut to the first 50 characters.
puts "\n最新文章:"
rss.latest_items(2).each do |entry|
  summary = entry[:description].slice(0, 50)

  puts "  标题: #{entry[:title]}"
  puts "  发布时间: #{entry[:pub_date]}"
  puts "  链接: #{entry[:link]}"
  puts "  摘要: #{summary}..."
  puts
end

SOAP Web服务客户端

ruby
require 'nokogiri'

# 模拟SOAP响应
soap_response = <<~SOAP
  <?xml version="1.0" encoding="UTF-8"?>
  <soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body>
      <GetWeatherResponse xmlns="http://example.com/weather">
        <City>北京</City>
        <Temperature>25</Temperature>
        <Humidity>60</Humidity>
        <Condition>晴朗</Condition>
        <Forecast>
          <Day date="2024-01-02">
            <High>27</High>
            <Low>18</Low>
            <Condition>多云</Condition>
          </Day>
          <Day date="2024-01-03">
            <High>23</High>
            <Low>15</Low>
            <Condition>小雨</Condition>
          </Day>
        </Forecast>
      </GetWeatherResponse>
    </soap:Body>
  </soap:Envelope>
SOAP

# Reads a GetWeatherResponse SOAP envelope and exposes its contents.
class WeatherServiceClient
  # Prefix bindings for every XPath query. The payload elements live in the
  # default namespace declared on <GetWeatherResponse>, not on the document
  # root, so selectors without a namespace do not match them; each query
  # names the namespace explicitly instead.
  NAMESPACES = {
    'soap' => 'http://schemas.xmlsoap.org/soap/envelope/',
    'w' => 'http://example.com/weather'
  }.freeze

  # @param soap_response [String] the SOAP envelope XML
  def initialize(soap_response)
    @doc = Nokogiri::XML(soap_response)
  end

  # Current conditions, read from the direct children of the response
  # element. A descendant search for Condition would also pick up the
  # <Condition> of every forecast day.
  #
  # @return [Hash] :city, :temperature (Integer), :humidity (Integer), :condition
  def current_weather
    response = @doc.at_xpath('//soap:Body/w:GetWeatherResponse', NAMESPACES)
    {
      city: text_at(response, 'w:City'),
      temperature: text_at(response, 'w:Temperature').to_i,
      humidity: text_at(response, 'w:Humidity').to_i,
      condition: text_at(response, 'w:Condition')
    }
  end

  # @return [Array<Hash>] one hash per <Day> with :date, :high, :low, :condition
  def forecast
    @doc.xpath('//w:Forecast/w:Day', NAMESPACES).map do |day|
      {
        date: day['date'],
        high: text_at(day, 'w:High').to_i,
        low: text_at(day, 'w:Low').to_i,
        condition: text_at(day, 'w:Condition')
      }
    end
  end

  # @return [String] multi-line report: current conditions, then the forecast
  def weather_report
    current = current_weather

    report = +"=== 天气报告 ===\n"
    report << "城市: #{current[:city]}\n"
    report << "当前温度: #{current[:temperature]}°C\n"
    report << "湿度: #{current[:humidity]}%\n"
    report << "天气状况: #{current[:condition]}\n\n"

    report << "天气预报:\n"
    forecast.each do |day|
      report << "#{day[:date]}: #{day[:low]}°C - #{day[:high]}°C, #{day[:condition]}\n"
    end

    report
  end

  private

  # Text of the nodes matching +path+ relative to +node+ ('' if none).
  def text_at(node, path)
    node.xpath(path, NAMESPACES).text
  end
end

# Build a client from the sample SOAP response and print its report.
puts WeatherServiceClient.new(soap_response).weather_report

📊 XML性能优化

大型XML文件处理

ruby
require 'nokogiri'

# Streams a large XML file record by record instead of loading it whole.
class LargeXMLProcessor
  # Walk the file with a pull parser and hand each <record> element to
  # process_record as an XML string.
  #
  # @param file_path [String] path of the XML file to read
  def self.stream_parse(file_path)
    File.open(file_path) do |file|
      Nokogiri::XML::Reader.from_io(file).each do |node|
        # The reader also reports closing tags under the same name, so only
        # element-start nodes are processed.
        next unless node.name == 'record' && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT

        process_record(node.outer_xml)
      end
    end
  end

  # Parse one <record> fragment and print its id, name and value.
  #
  # @param record_xml [String] the record element serialized as XML
  def self.process_record(record_xml)
    record = Nokogiri::XML(record_xml).root

    id = record['id']
    name = record.css('name').text
    value = record.css('value').text

    puts "处理记录: ID=#{id}, Name=#{name}, Value=#{value}"
  end
  # A bare `private` does not apply to `def self.` methods; class methods
  # have to be hidden explicitly.
  private_class_method :process_record
end

# 创建测试数据
test_xml = <<~XML
  <?xml version="1.0" encoding="UTF-8"?>
  <records>
    <record id="1">
      <name>记录1</name>
      <value>值1</value>
    </record>
    <record id="2">
      <name>记录2</name>
      <value>值2</value>
    </record>
    <!-- 更多记录... -->
  </records>
XML

# Save the sample records to disk so the streaming parser has a file to read.
File.open('large_data.xml', 'w') { |file| file.write(test_xml) }

# Streaming run (left disabled in this walkthrough):
# LargeXMLProcessor.stream_parse('large_data.xml')

XML缓存机制

ruby
require 'nokogiri'
require 'digest'

# Caches parsed documents so identical XML content is parsed only once per
# hour within the current process.
#
# Parsed Nokogiri documents cannot be serialized with Marshal (dumping one
# raises TypeError), so an on-disk cache of parsed documents is not possible.
# Entries are therefore kept in memory, keyed by a digest of the content.
class XMLCache
  # Seconds a cached document stays valid.
  TTL_SECONDS = 3600

  # @param cache_dir [String] accepted for backward compatibility only;
  #   nothing is written to disk
  def initialize(cache_dir = './xml_cache')
    @cache_dir = cache_dir
    @entries = {}
  end

  # Return the parsed document for +xml_content+, reusing a cached parse
  # when one younger than TTL_SECONDS exists.
  #
  # NOTE: a cache hit returns the same document object as earlier calls, so
  # callers that mutate it will see each other's changes.
  #
  # @param xml_content [String]
  # @return [Nokogiri::XML::Document]
  def parse_with_cache(xml_content)
    key = Digest::MD5.hexdigest(xml_content)
    entry = @entries[key]

    if entry && entry[:stored_at] > Time.now - TTL_SECONDS
      puts "从缓存加载XML"
      entry[:doc]
    else
      puts "解析XML并缓存"
      doc = Nokogiri::XML(xml_content)
      @entries[key] = { doc: doc, stored_at: Time.now }
      doc
    end
  end
end

# Parse the same content twice through one cache instance: the first call
# parses, the second is meant to be served from the cache.
cache = XMLCache.new
xml_data = "<root><item>数据</item></root>"

doc1, doc2 = Array.new(2) { cache.parse_with_cache(xml_data) }

🎯 XML最佳实践

1. 错误处理

ruby
require 'nokogiri'

# Parses XML strictly and reports problems instead of raising.
class SafeXMLParser
  # @param xml_string [String]
  # @return [Nokogiri::XML::Document, nil] the document, or nil when parsing
  #   raised an error (a message is printed in that case)
  def self.parse(xml_string)
    # strict: no recovery from malformed input; noblanks: drop blank text nodes.
    document = Nokogiri::XML(xml_string) { |options| options.strict.noblanks }

    # Print any problems the parser recorded without raising.
    unless document.errors.empty?
      puts "XML解析警告:"
      document.errors.each { |warning| puts "  #{warning}" }
    end

    document
  rescue Nokogiri::XML::SyntaxError => e
    puts "XML语法错误: #{e.message}"
    nil
  rescue => e
    puts "解析错误: #{e.message}"
    nil
  end
end

# One well-formed input and one that is missing its closing </root> tag.
valid_xml = "<root><item>有效数据</item></root>"
invalid_xml = "<root><item>无效数据</item>"

puts "解析有效XML:"
doc1 = SafeXMLParser.parse(valid_xml)
puts "成功" unless doc1.nil?

puts "\n解析无效XML:"
doc2 = SafeXMLParser.parse(invalid_xml)
puts "失败" if doc2.nil?

2. XML验证

ruby
require 'nokogiri'

# XML Schema (XSD)
xsd_content = <<~XSD
  <?xml version="1.0" encoding="UTF-8"?>
  <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="book">
      <xs:complexType>
        <xs:sequence>
          <xs:element name="title" type="xs:string"/>
          <xs:element name="author" type="xs:string"/>
          <xs:element name="price" type="xs:decimal"/>
        </xs:sequence>
        <xs:attribute name="id" type="xs:string" use="required"/>
      </xs:complexType>
    </xs:element>
  </xs:schema>
XSD

xml_content = <<~XML
  <book id="1">
    <title>Ruby编程</title>
    <author>张三</author>
    <price>59.99</price>
  </book>
XML

# Validates XML documents against an XML Schema (XSD).
class XMLValidator
  # Validate +xml_content+ against +xsd_content+ and print the outcome.
  #
  # @param xml_content [String] the document to check
  # @param xsd_content [String] the schema source
  # @return [Boolean] true when the document is valid; false when it is not
  #   or when anything raised while loading the schema or the document
  def self.validate_with_xsd(xml_content, xsd_content)
    schema = Nokogiri::XML::Schema(xsd_content)
    problems = schema.validate(Nokogiri::XML(xml_content))

    if problems.empty?
      puts "XML验证通过"
      return true
    end

    puts "XML验证失败:"
    problems.each { |problem| puts "  #{problem}" }
    false
  rescue => e
    puts "验证错误: #{e.message}"
    false
  end
end

# Validate the sample <book> document against the schema defined above it;
# the method prints the outcome and returns true/false.
XMLValidator.validate_with_xsd(xml_content, xsd_content)

3. 安全处理

ruby
require 'nokogiri'

# Helpers for handling XML from untrusted sources (XXE hardening).
class SecureXMLHandler
  # A whole document type declaration, including an optional internal
  # subset ("[ ... ]") that may span several lines.
  DOCTYPE_PATTERN = /<!DOCTYPE\b[^\[>]*(?:\[.*?\]\s*)?>/m.freeze
  # A stray entity declaration found outside a DOCTYPE.
  ENTITY_PATTERN = /<!ENTITY\b.*?>/m.freeze
  # A CDATA section.
  CDATA_PATTERN = /<!\[CDATA\[.*?\]\]>/m.freeze

  # Parse with entity substitution, DTD loading and network access switched
  # off, then detach any DTD from the result.
  #
  # @param xml_content [String]
  # @return [Nokogiri::XML::Document]
  # @raise [Nokogiri::XML::SyntaxError] if the input is not well-formed
  #   (strict mode does not recover)
  def self.safe_parse(xml_content)
    doc = Nokogiri::XML(xml_content) do |config|
      config.strict.nonet.noblanks # no recovery, no network, drop blank nodes
      config.nonoent.nodtdload     # never substitute entities or load a DTD
    end

    # Drop the internal DTD, and with it every entity declaration. The DTD
    # is not an element, so it cannot be reached with a CSS/XPath search.
    doc.internal_subset&.remove

    doc
  end

  # Strip CDATA sections, DOCTYPE declarations (with their internal subset)
  # and stray entity declarations from raw XML text.
  #
  # This is a text-level pre-filter, not a parser. Entity *references*
  # such as "&xxe;" are left in place.
  #
  # @param xml_content [String]
  # @return [String] a new string; the argument is not modified
  def self.sanitize_xml(xml_content)
    xml_content
      .gsub(CDATA_PATTERN, '')
      .gsub(DOCTYPE_PATTERN, '')
      .gsub(ENTITY_PATTERN, '')
  end
end

# Classic XXE payload: an external entity that points at a local file.
unsafe_xml = <<~XML
  <?xml version="1.0" encoding="UTF-8"?>
  <!DOCTYPE foo [ <!ENTITY xxe SYSTEM "file:///etc/passwd" > ]>
  <root>&xxe;</root>
XML

puts "原始XML:"
puts unsafe_xml

puts "\n清理后的XML:"
safe_xml = SecureXMLHandler.sanitize_xml(unsafe_xml)
puts safe_xml

puts "\n安全解析:"
begin
  doc = SecureXMLHandler.safe_parse(safe_xml)
  puts doc.to_xml
rescue Nokogiri::SyntaxError => e
  # With the DTD stripped, "&xxe;" refers to an undeclared entity, so a
  # strict parse is expected to reject the document rather than return one.
  puts "解析被拒绝: #{e.message}"
end

📚 下一步学习

掌握了Ruby XML处理后,建议继续学习Ruby的其他主题。

继续您的Ruby学习之旅吧!

本站内容仅供学习和研究使用。