Skip to content

Scala 正则表达式

正则表达式是处理文本和字符串匹配的强大工具。Scala 提供了丰富的正则表达式支持,包括模式匹配集成和便利的 API。

正则表达式基础

创建正则表达式

scala
import scala.util.matching.Regex

/** Shows the different ways to build a `Regex` and the basic search calls on it. */
object RegexBasics {
  def main(args: Array[String]): Unit = {
    // Three equivalent ways to obtain a Regex instance.
    val viaDotR: Regex = "\\d+".r                                  // the .r extension method
    val viaConstructor: Regex = new Regex("\\d+")                  // explicit constructor
    val viaRawString: Regex = """(\d{4})-(\d{2})-(\d{2})""".r      // raw string, no double escaping

    // Inline flags embedded in the pattern itself.
    val ignoringCase: Regex = "(?i)hello".r
    val lineAnchored: Regex = "(?m)^start".r

    println("正则表达式创建成功")

    // Basic searching.
    val sample = "The year 2023 has 365 days"
    val digits = "\\d+".r

    // First occurrence only.
    val leading = digits.findFirstIn(sample)
    val leadingText = leading.fold("未找到")(identity)
    println(s"第一个数字: $leadingText")

    // Every occurrence.
    val everyNumber = digits.findAllIn(sample).toList
    println(s"所有数字: $everyNumber")

    // Existence check.
    val containsDigits = digits.findFirstIn(sample).nonEmpty
    println(s"包含数字: $containsDigits")
  }
}

基本匹配操作

scala
/** Extracts e-mail addresses, phone numbers and dates from a block of free text. */
object BasicMatching {
  def main(args: Array[String]): Unit = {
    val emailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
    val phonePattern = """\d{3}-\d{3}-\d{4}""".r
    val datePattern = """\d{4}-\d{2}-\d{2}""".r

    val text = """
      联系信息:
      邮箱: john.doe@example.com, alice@company.org
      电话: 555-123-4567, 555-987-6543
      日期: 2023-12-25, 2024-01-01
    """

    // Collect every match of each pattern and print them comma-separated.
    val foundEmails = emailPattern.findAllIn(text).mkString(", ")
    println(s"邮箱地址: $foundEmails")

    val foundPhones = phonePattern.findAllIn(text).mkString(", ")
    println(s"电话号码: $foundPhones")

    val foundDates = datePattern.findAllIn(text).mkString(", ")
    println(s"日期: $foundDates")

    // findAllMatchIn yields Match objects, which also expose the match offsets.
    println("\n详细匹配信息:")
    for (hit <- emailPattern.findAllMatchIn(text)) {
      println(s"邮箱: '${hit.matched}' 位置: ${hit.start}-${hit.end}")
    }
  }
}

捕获组

基本捕获组

scala
/** Demonstrates positional capture groups, both via `Match.group` and via pattern matching. */
object CaptureGroups {
  def main(args: Array[String]): Unit = {
    // Patterns with capture groups.
    val namePattern = """(\w+)\s+(\w+)""".r            // first name, last name
    val emailPattern = """(\w+)@(\w+\.\w+)""".r        // user, domain
    val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r  // year, month, day

    val text = "John Smith的邮箱是john@example.com,注册日期是2023-12-25"

    // Name: group(0) is the whole match, group(1..n) are the captures.
    namePattern.findFirstMatchIn("John Smith").fold(println("未找到名字模式")) { hit =>
      println(s"名字: ${hit.group(1)}")
      println(s"姓氏: ${hit.group(2)}")
      println(s"完整匹配: ${hit.group(0)}")
    }

    // E-mail parts.
    emailPattern.findFirstMatchIn(text).fold(println("未找到邮箱")) { hit =>
      println(s"用户名: ${hit.group(1)}")
      println(s"域名: ${hit.group(2)}")
    }

    // Date parts.
    datePattern.findFirstMatchIn(text).fold(println("未找到日期")) { hit =>
      println(s"年: ${hit.group(1)}")
      println(s"月: ${hit.group(2)}")
      println(s"日: ${hit.group(3)}")
    }

    // A Regex is also an extractor (unapplySeq), so it can be used directly in a `case`.
    // The whole input must match for the case to fire.
    "2023-12-25" match {
      case datePattern(y, mo, d) => println(s"模式匹配 - 年: $y, 月: $mo, 日: $d")
      case _                     => println("日期格式不匹配")
    }
  }
}

命名捕获组

scala
/** Parses a log line using named capture groups, then again using an extractor plus a case class. */
object NamedCaptureGroups {
  def main(args: Array[String]): Unit = {
    // Java-style named groups; looked up by name on the resulting Match.
    val logPattern = """(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?<level>\w+)\] (?<message>.+)""".r

    val logEntry = "2023-12-25 10:30:45 [INFO] Application started successfully"

    logPattern.findFirstMatchIn(logEntry).fold(println("日志格式不匹配")) { hit =>
      println(s"时间戳: ${hit.group("timestamp")}")
      println(s"级别: ${hit.group("level")}")
      println(s"消息: ${hit.group("message")}")
    }

    // Alternative: positional groups fed straight into a case class.
    case class LogEntry(timestamp: String, level: String, message: String)

    val simpleLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)""".r

    /** Returns the parsed entry, or None when the line does not match the pattern as a whole. */
    def parseLogEntry(log: String): Option[LogEntry] =
      PartialFunction.condOpt(log) {
        case simpleLogPattern(ts, lvl, msg) => LogEntry(ts, lvl, msg)
      }

    val outcome = parseLogEntry(logEntry).fold("无法解析日志")(entry => s"解析的日志: $entry")
    println(outcome)
  }
}

字符串替换

基本替换操作

scala
/** Demonstrates literal, group-based and function-based regex replacement. */
object StringReplacement {
  def main(args: Array[String]): Unit = {
    val text = "The quick brown fox jumps over the lazy dog. The fox is quick."
    val fox = "fox".r

    // Replace every occurrence.
    val allFoxesGone = fox.replaceAllIn(text, "cat")
    println(s"替换 fox -> cat: $allFoxesGone")

    // Replace only the first occurrence.
    val firstFoxGone = fox.replaceFirstIn(text, "wolf")
    println(s"替换第一个 fox -> wolf: $firstFoxGone")

    // $1, $2, $3 in the replacement refer to the capture groups.
    val phonePattern = """(\d{3})-(\d{3})-(\d{4})""".r
    val phoneText = "电话号码: 555-123-4567 和 555-987-6543"
    val prettyPhones = phonePattern.replaceAllIn(phoneText, "($1) $2-$3")
    println(s"格式化电话: $prettyPhones")

    // The replacement can be computed from each match.
    val digits = """\d+""".r
    val fruitText = "我有5个苹果和10个橙子"
    val doubled = digits.replaceAllIn(fruitText, hit => {
      val twice = hit.matched.toInt * 2
      twice.toString
    })
    println(s"数字翻倍: $doubled")

    // Conditional replacement: upper-case only the words longer than four characters.
    val word = """\b\w+\b""".r
    val shouted = word.replaceAllIn(text, hit => {
      val w = hit.matched
      if (w.length <= 4) w else w.toUpperCase
    })
    println(s"长单词大写: $shouted")
  }
}

高级替换技巧

scala
/**
 * Demonstrates more involved replacement tasks: stripping HTML tags, turning URLs into
 * anchors, masking card numbers, reordering date parts and a multi-step clean-up.
 */
object AdvancedReplacement {
  def main(args: Array[String]): Unit = {
    // Strip HTML tags.
    val htmlText = "<p>这是一个<strong>重要</strong>的<em>消息</em>。</p>"
    val htmlTagPattern = """<[^>]+>""".r
    val cleanText = htmlTagPattern.replaceAllIn(htmlText, "")
    println(s"清理HTML: $cleanText")

    // Turn URLs into links.
    // `[` is escaped inside the character class: java.util.regex treats a bare `[` there as
    // the start of a nested class, which leaves the outer class unclosed and makes the
    // pattern fail to compile with a PatternSyntaxException.
    val urlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
    val textWithUrls = "访问 https://www.example.com 或 http://blog.example.org/post"
    val linkedText = urlPattern.replaceAllIn(textWithUrls, m => {
      // The string returned by the replacer is still interpreted as a replacement template,
      // and the URL pattern admits `$`, so the matched text has to be quoted.
      val url = scala.util.matching.Regex.quoteReplacement(m.matched)
      s"<a href='$url'>$url</a>"
    })
    println(s"URL转链接: $linkedText")

    // Mask sensitive data: keep the first and last group, hide the middle two.
    val creditCardPattern = """(\d{4})-(\d{4})-(\d{4})-(\d{4})""".r
    val sensitiveText = "信用卡号: 1234-5678-9012-3456"
    val maskedText = creditCardPattern.replaceAllIn(sensitiveText, "$1-****-****-$4")
    println(s"信用卡脱敏: $maskedText")

    // Reorder date parts from YYYY-MM-DD to DD/MM/YYYY.
    val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r
    val dateText = "会议日期: 2023-12-25"
    val reformattedDate = datePattern.replaceAllIn(dateText, "$3/$2/$1")
    println(s"日期格式转换: $reformattedDate")

    /** Removes punctuation, collapses whitespace runs, then trims and lower-cases. */
    def cleanAndFormat(text: String): String = {
      val withoutPunctuation = """[^\w\s]""".r.replaceAllIn(text, "")
      val singleSpaced = """\s+""".r.replaceAllIn(withoutPunctuation, " ")
      singleSpaced.trim.toLowerCase
    }

    val messyText = "  Hello,   World!!!   How are you???  "
    println(s"清理格式化: '${cleanAndFormat(messyText)}'")
  }
}

模式匹配集成

正则表达式与模式匹配

scala
/**
 * Uses regexes as extractors inside `match` expressions to classify and validate
 * free-form input strings. A pattern only fires when the whole input matches it.
 */
object RegexPatternMatching {
  // Extractor patterns; each capture group becomes one bound variable in a `case`.
  val EmailPattern = """(\w+)@(\w+\.\w+)""".r
  val PhonePattern = """(\d{3})-(\d{3})-(\d{4})""".r
  val DatePattern = """(\d{4})-(\d{2})-(\d{2})""".r
  val TimePattern = """(\d{2}):(\d{2}):(\d{2})""".r
  val IpPattern = """(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})""".r

  /**
   * Describes which known format the trimmed input has.
   *
   * @param input raw text; surrounding whitespace is ignored for matching
   * @return a human-readable description, or a "not recognised" message echoing the input
   */
  def classifyInput(input: String): String = input.trim match {
    case EmailPattern(user, domain) =>
      s"邮箱地址: 用户名=$user, 域名=$domain"

    case PhonePattern(area, exchange, number) =>
      s"电话号码: 区号=$area, 交换机=$exchange, 号码=$number"

    case DatePattern(year, month, day) =>
      s"日期: $year年$month月$day日"

    case TimePattern(hour, minute, second) =>
      s"时间: $hour:$minute:$second"

    case IpPattern(a, b, c, d) =>
      s"IP地址: $a.$b.$c.$d"

    case _ =>
      s"未识别的格式: $input"
  }

  /**
   * Extracts the parts of an e-mail address, phone number or calendar date.
   *
   * @param input text that must match one of the patterns exactly (no trimming is done)
   * @return a map with a "type" key plus the extracted parts, or None when the input
   *         matches no pattern or is a date that does not exist
   */
  def validateAndExtract(input: String): Option[Map[String, String]] = input match {
    case EmailPattern(user, domain) =>
      Some(Map("type" -> "email", "user" -> user, "domain" -> domain))

    case PhonePattern(area, exchange, number) =>
      Some(Map("type" -> "phone", "area" -> area, "exchange" -> exchange, "number" -> number))

    // The groups are fixed-width digit runs, so toInt cannot fail here.
    case DatePattern(year, month, day) if isValidDate(year.toInt, month.toInt, day.toInt) =>
      Some(Map("type" -> "date", "year" -> year, "month" -> month, "day" -> day))

    case _ => None
  }

  /**
   * Checks that the given year/month/day is a real calendar date.
   *
   * The day is checked against the actual length of the month (leap years included),
   * so dates such as February 30th are rejected.
   */
  def isValidDate(year: Int, month: Int, day: Int): Boolean = {
    // The month range is tested first because Month.of throws outside 1..12.
    month >= 1 && month <= 12 && day >= 1 &&
      day <= java.time.Month.of(month).length(java.time.Year.isLeap(year.toLong))
  }

  def main(args: Array[String]): Unit = {
    val inputs = List(
      "john@example.com",
      "555-123-4567",
      "2023-12-25",
      "14:30:45",
      "192.168.1.1",
      "invalid-input",
      "2023-13-45"  // not a valid date
    )

    println("输入分类:")
    inputs.foreach(input => println(s"$input -> ${classifyInput(input)}"))

    println("\n验证和提取:")
    inputs.foreach { input =>
      validateAndExtract(input) match {
        case Some(data) => println(s"$input -> 有效: $data")
        case None => println(s"$input -> 无效")
      }
    }
  }
}
```

## 实际应用示例

### 日志分析器

```scala
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

/**
 * Parses lines from three log formats (Apache access log, bracketed application log,
 * plain error log) into typed entries and prints simple statistics for the Apache ones.
 */
object LogAnalyzer {
  // One extractor pattern per supported log format.
  val ApacheLogPattern = """(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
  val ApplicationLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (\w+): (.+)""".r
  val ErrorLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ERROR (.+) - (.+)""".r

  /** Common parent of every parsed entry, so callers get a closed type instead of `Any`. */
  sealed trait LogEntry

  final case class ApacheLogEntry(
    ip: String,
    timestamp: String,
    method: String,
    url: String,
    protocol: String,
    status: Int,
    size: Int
  ) extends LogEntry

  final case class ApplicationLogEntry(
    timestamp: String,
    level: String,
    logger: String,
    message: String
  ) extends LogEntry

  final case class ErrorLogEntry(
    timestamp: String,
    location: String,
    message: String
  ) extends LogEntry

  /**
   * Parses one log line.
   *
   * @param line a complete log line; it must match one of the patterns in full
   * @return the typed entry, or None when the line matches no known format or its
   *         byte count does not fit in an Int
   */
  def parseLogLine(line: String): Option[LogEntry] = line match {
    case ApacheLogPattern(ip, timestamp, method, url, protocol, status, size) =>
      // status is exactly three digits, so toInt is safe; size is an unbounded digit run
      // and would make toInt throw on very large values.
      size.toIntOption.map { bytes =>
        ApacheLogEntry(ip, timestamp, method, url, protocol, status.toInt, bytes)
      }

    case ApplicationLogPattern(timestamp, level, logger, message) =>
      Some(ApplicationLogEntry(timestamp, level, logger, message))

    case ErrorLogPattern(timestamp, location, message) =>
      Some(ErrorLogEntry(timestamp, location, message))

    case _ => None
  }

  /** Counts the entries per distinct value of `key`. */
  private def countBy[K](logs: List[ApacheLogEntry])(key: ApacheLogEntry => K): Map[K, Int] =
    logs.groupBy(key).view.mapValues(_.size).toMap

  /** Returns up to five (value, count) pairs with the highest counts first. */
  private def topFive[K](counts: Map[K, Int]): Seq[(K, Int)] =
    counts.toSeq.sortBy { case (_, n) => -n }.take(5)

  /** Prints the status-code distribution, the most requested URLs and the busiest IPs. */
  def analyzeApacheLogs(logs: List[ApacheLogEntry]): Unit = {
    println("Apache 日志分析:")

    // Status code distribution.
    val statusCounts = countBy(logs)(_.status)
    println(s"状态码分布: $statusCounts")

    // Most requested URLs.
    println("最常访问的URL:")
    topFive(countBy(logs)(_.url)).foreach { case (url, count) => println(s"  $url: $count 次") }

    // Most active client addresses.
    println("最活跃的IP:")
    topFive(countBy(logs)(_.ip)).foreach { case (ip, count) => println(s"  $ip: $count 次") }
  }

  def main(args: Array[String]): Unit = {
    val sampleLogs = List(
      """192.168.1.1 - - [25/Dec/2023:10:00:00 +0000] "GET /index.html HTTP/1.1" 200 1234""",
      """192.168.1.2 - - [25/Dec/2023:10:01:00 +0000] "POST /api/users HTTP/1.1" 201 567""",
      """2023-12-25 10:00:00 [INFO] UserService: User login successful""",
      """2023-12-25 10:01:00 ERROR DatabaseConnection - Connection timeout""",
      """192.168.1.1 - - [25/Dec/2023:10:02:00 +0000] "GET /about.html HTTP/1.1" 200 890"""
    )

    // Lines that cannot be parsed are dropped.
    val parsedLogs = sampleLogs.flatMap(parseLogLine)

    println("解析的日志条目:")
    parsedLogs.foreach(println)

    // Only the Apache entries are analysed further.
    val apacheLogs = parsedLogs.collect { case log: ApacheLogEntry => log }
    if (apacheLogs.nonEmpty) {
      analyzeApacheLogs(apacheLogs)
    }
  }
}

文本处理工具

scala
/**
 * Text utilities built on regexes: entity extraction (URLs, e-mails, hashtags, mentions,
 * phone numbers), quote and code-block extraction, keyword highlighting, censoring and a
 * minimal Markdown-to-HTML conversion.
 */
object TextProcessor {
  // Entity patterns.
  // `[` is escaped inside the URL character class: java.util.regex treats a bare `[` there
  // as the start of a nested class, which leaves the outer class unclosed and makes the
  // pattern fail to compile.
  val UrlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
  val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val HashtagPattern = """#\w+""".r
  val MentionPattern = """@\w+""".r
  val PhonePattern = """\b\d{3}-\d{3}-\d{4}\b""".r

  // Compiled once instead of on every analyzeText call.
  private val WordPattern = """\b\w+\b""".r

  // Markdown rules, applied in this order: bold must run before italic because the
  // italic pattern would otherwise consume the inner `*...*` of `**...**`.
  private val MarkdownRules: List[(scala.util.matching.Regex, String)] = List(
    """\*\*([^*]+)\*\*""".r -> "<strong>$1</strong>",         // bold
    """\*([^*]+)\*""".r -> "<em>$1</em>",                     // italic
    """\[([^\]]+)\]\(([^)]+)\)""".r -> "<a href='$2'>$1</a>", // link
    """`([^`]+)`""".r -> "<code>$1</code>"                    // inline code
  )

  case class TextAnalysis(
    urls: List[String],
    emails: List[String],
    hashtags: List[String],
    mentions: List[String],
    phones: List[String],
    wordCount: Int,
    characterCount: Int
  )

  /** Collects every entity kind found in `text` together with word and character counts. */
  def analyzeText(text: String): TextAnalysis = {
    TextAnalysis(
      urls = UrlPattern.findAllIn(text).toList,
      emails = EmailPattern.findAllIn(text).toList,
      hashtags = HashtagPattern.findAllIn(text).toList,
      mentions = MentionPattern.findAllIn(text).toList,
      phones = PhonePattern.findAllIn(text).toList,
      wordCount = WordPattern.findAllIn(text).length,
      characterCount = text.length
    )
  }

  /** Returns the contents of every non-empty double-quoted span, without the quotes. */
  def extractQuotes(text: String): List[String] = {
    val quotePattern = """"([^"]+)"""".r
    quotePattern.findAllMatchIn(text).map(_.group(1)).toList
  }

  /** Compiles a case-insensitive whole-word pattern that matches `literal` verbatim. */
  private def wholeWord(literal: String): scala.util.matching.Regex =
    // Quoting keeps regex metacharacters in the word from being interpreted.
    s"(?i)\\b${scala.util.matching.Regex.quote(literal)}\\b".r

  /**
   * Wraps every whole-word, case-insensitive occurrence of each keyword in `**`.
   *
   * @param text     text to process
   * @param keywords literal keywords; they are not interpreted as regexes
   */
  def highlightKeywords(text: String, keywords: List[String]): String = {
    keywords.foldLeft(text) { (result, keyword) =>
      wholeWord(keyword).replaceAllIn(
        result,
        // The replacer result is still a replacement template, so quote the matched text.
        m => s"**${scala.util.matching.Regex.quoteReplacement(m.matched)}**"
      )
    }
  }

  /**
   * Replaces every whole-word, case-insensitive occurrence of each listed word with
   * asterisks, one per character of the listed word.
   */
  def censorProfanity(text: String, profanityList: List[String]): String = {
    profanityList.foldLeft(text) { (result, word) =>
      wholeWord(word).replaceAllIn(result, "*" * word.length)
    }
  }

  /** Returns the body of every fenced code block, without the fence and language tag. */
  def extractCodeBlocks(text: String): List[String] = {
    // (?s) lets `.` match line breaks; without it only impossible single-line bodies match.
    val codeBlockPattern = """(?s)```(\w+)?\n(.*?)\n```""".r
    codeBlockPattern.findAllMatchIn(text).map(_.group(2)).toList
  }

  /** Converts bold, italic, link and inline-code Markdown into the matching HTML tags. */
  def formatMarkdown(text: String): String =
    MarkdownRules.foldLeft(text) { case (acc, (pattern, replacement)) =>
      pattern.replaceAllIn(acc, replacement)
    }

  def main(args: Array[String]): Unit = {
    val sampleText = """
      查看我的网站 https://www.example.com 或发邮件到 contact@example.com
      关注我 @username 并使用标签 #scala #programming
      电话: 555-123-4567
      "这是一个引用的文本"
      **粗体文本** 和 *斜体文本*
      [链接文本](https://link.com)
      `代码片段`
    """.trim

    // Entity analysis.
    val analysis = analyzeText(sampleText)
    println("文本分析结果:")
    println(s"URLs: ${analysis.urls}")
    println(s"邮箱: ${analysis.emails}")
    println(s"标签: ${analysis.hashtags}")
    println(s"提及: ${analysis.mentions}")
    println(s"电话: ${analysis.phones}")
    println(s"单词数: ${analysis.wordCount}")
    println(s"字符数: ${analysis.characterCount}")

    // Quoted spans.
    val quotes = extractQuotes(sampleText)
    println(s"\n引用内容: $quotes")

    // Keyword highlighting.
    val highlighted = highlightKeywords(sampleText, List("scala", "programming"))
    println(s"\n关键词高亮:\n$highlighted")

    // Markdown conversion.
    val formatted = formatMarkdown(sampleText)
    println(s"\nMarkdown 格式化:\n$formatted")
  }
}

数据验证器

scala
/**
 * Validates form-style string fields against anchored regex patterns, plus two custom
 * validators (age and calendar date) that add range checks on top of the format check.
 */
object DataValidator {
  // Validation patterns; all are anchored so the whole value must match.
  val EmailPattern = """^[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}$""".r
  val PhonePattern = """^\d{3}-\d{3}-\d{4}$""".r
  val ZipCodePattern = """^\d{5}(-\d{4})?$""".r
  val CreditCardPattern = """^\d{4}-\d{4}-\d{4}-\d{4}$""".r
  val PasswordPattern = """^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$""".r
  val UsernamePattern = """^[a-zA-Z0-9_]{3,20}$""".r
  // `[` is escaped inside the character class: java.util.regex treats a bare `[` there as
  // the start of a nested class, which leaves the outer class unclosed and makes the
  // pattern fail to compile.
  val UrlPattern = """^https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?$""".r

  // Patterns for the custom validators, compiled once rather than on every call.
  private val AgePattern = """^\d+$""".r
  private val IsoDatePattern = """^(\d{4})-(\d{2})-(\d{2})$""".r

  sealed trait ValidationResult
  case object Valid extends ValidationResult
  case class Invalid(message: String) extends ValidationResult

  case class ValidationRule(name: String, pattern: Regex, errorMessage: String)

  /** Rules keyed by the field type accepted by [[validate]]. */
  val validationRules = Map(
    "email" -> ValidationRule("邮箱", EmailPattern, "邮箱格式无效"),
    "phone" -> ValidationRule("电话", PhonePattern, "电话格式应为 XXX-XXX-XXXX"),
    "zipcode" -> ValidationRule("邮编", ZipCodePattern, "邮编格式无效"),
    "creditcard" -> ValidationRule("信用卡", CreditCardPattern, "信用卡格式应为 XXXX-XXXX-XXXX-XXXX"),
    "password" -> ValidationRule("密码", PasswordPattern, "密码必须包含大小写字母、数字和特殊字符,至少8位"),
    "username" -> ValidationRule("用户名", UsernamePattern, "用户名只能包含字母、数字和下划线,3-20位"),
    "url" -> ValidationRule("URL", UrlPattern, "URL格式无效")
  )

  /**
   * Validates `value` against the rule registered for `fieldType`.
   *
   * @return Valid when the value matches, otherwise Invalid carrying the rule's message,
   *         or an "unknown field type" message when no rule is registered
   */
  def validate(fieldType: String, value: String): ValidationResult =
    validationRules.get(fieldType) match {
      case Some(rule) if rule.pattern.matches(value) => Valid
      case Some(rule) => Invalid(rule.errorMessage)
      case None => Invalid(s"未知的字段类型: $fieldType")
    }

  /** Validates every entry, using each key as the field type of its value. */
  def validateMultiple(data: Map[String, String]): Map[String, ValidationResult] = {
    data.map { case (fieldType, value) =>
      fieldType -> validate(fieldType, value)
    }
  }

  /**
   * Validates that `ageStr` consists of digits only and denotes an age in 0..150.
   *
   * Digit strings too long to fit in an Int are reported as out of range instead of
   * failing with a NumberFormatException.
   */
  def validateAge(ageStr: String): ValidationResult =
    ageStr match {
      case AgePattern() =>
        if (ageStr.toIntOption.exists(age => age >= 0 && age <= 150)) Valid
        else Invalid("年龄必须在0-150之间")
      case _ =>
        Invalid("年龄必须是数字")
    }

  /**
   * Validates a YYYY-MM-DD string: the year must be in 1900..2100 and the month/day must
   * form a real calendar date (month lengths and leap years are taken into account).
   */
  def validateDate(dateStr: String): ValidationResult =
    dateStr match {
      case IsoDatePattern(year, month, day) =>
        // The groups are fixed-width digit runs, so toInt cannot fail here.
        val y = year.toInt
        val m = month.toInt
        val d = day.toInt

        // The month range is tested before Month.of, which throws outside 1..12.
        val exists = y >= 1900 && y <= 2100 && m >= 1 && m <= 12 && d >= 1 &&
          d <= java.time.Month.of(m).length(java.time.Year.isLeap(y.toLong))

        if (exists) Valid else Invalid("日期值超出有效范围")
      case _ =>
        Invalid("日期格式应为 YYYY-MM-DD")
    }

  def main(args: Array[String]): Unit = {
    val testData = Map(
      "email" -> "user@example.com",
      "phone" -> "555-123-4567",
      "zipcode" -> "12345",
      "creditcard" -> "1234-5678-9012-3456",
      "password" -> "MyPass123!",
      "username" -> "user_123",
      "url" -> "https://www.example.com"
    )

    val invalidData = Map(
      "email" -> "invalid-email",
      "phone" -> "123456789",
      "zipcode" -> "abc",
      "password" -> "weak"
    )

    println("有效数据验证:")
    validateMultiple(testData).foreach { case (field, result) =>
      println(s"$field: $result")
    }

    println("\n无效数据验证:")
    validateMultiple(invalidData).foreach { case (field, result) =>
      println(s"$field: $result")
    }

    println("\n自定义验证:")
    println(s"年龄 '25': ${validateAge("25")}")
    println(s"年龄 '200': ${validateAge("200")}")
    println(s"日期 '2023-12-25': ${validateDate("2023-12-25")}")
    println(s"日期 '2023-13-45': ${validateDate("2023-13-45")}")
  }
}

性能优化

编译和缓存

scala
/** Contrasts recompiling a regex per use with reusing a precompiled one, with a tiny benchmark. */
object RegexPerformance {
  // Compiled once when the object is initialised.
  val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val PhonePattern = """\d{3}-\d{3}-\d{4}""".r

  /** Keeps the texts containing an e-mail address; deliberately recompiles the regex per element. */
  def inefficientMatching(texts: List[String]): List[String] =
    for {
      candidate <- texts
      // Compiled again for every element - this is the slow variant being demonstrated.
      if """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r.findFirstIn(candidate).nonEmpty
    } yield candidate

  /** Keeps the texts containing an e-mail address, reusing the precompiled pattern. */
  def efficientMatching(texts: List[String]): List[String] =
    for {
      candidate <- texts
      if EmailPattern.findFirstIn(candidate).nonEmpty
    } yield candidate

  /**
   * Splits the texts into those containing an e-mail address, those (of the rest)
   * containing a phone number, and everything else. Input order is kept in each bucket.
   */
  def batchProcess(texts: List[String]): Map[String, List[String]] = {
    val (withEmail, withoutEmail) = texts.partition(t => EmailPattern.findFirstIn(t).nonEmpty)
    val (withPhone, remainder) = withoutEmail.partition(t => PhonePattern.findFirstIn(t).nonEmpty)

    Map(
      "emails" -> withEmail,
      "phones" -> withPhone,
      "others" -> remainder
    )
  }

  /** Times the three approaches above and prints the elapsed milliseconds of each. */
  def benchmarkRegex(): Unit = {
    // 10000 copies of each sample, grouped by sample in this order.
    val testTexts = List("user@example.com", "555-123-4567", "random text")
      .flatMap(sample => List.fill(10000)(sample))

    /** Runs `operation` once, prints its wall-clock duration and returns its result. */
    def timeOperation[T](name: String)(operation: => T): T = {
      val start = System.nanoTime()
      val outcome = operation
      val end = System.nanoTime()
      println(f"$name%20s: ${(end - start) / 1000000}%6d ms")
      outcome
    }

    println("正则表达式性能测试:")

    val subset = testTexts.take(1000)
    timeOperation("低效匹配")(inefficientMatching(subset))
    timeOperation("高效匹配")(efficientMatching(subset))
    timeOperation("批量处理")(batchProcess(testTexts))
  }

  def main(args: Array[String]): Unit = benchmarkRegex()
}

最佳实践

  1. 预编译正则表达式

    • 避免在循环中重复编译
    • 使用 .r 方法或 new Regex()
    • 将常用模式定义为常量
  2. 使用原始字符串

    • 使用三重引号避免转义
    • 提高可读性和维护性
    • 减少转义字符的复杂性
  3. 合理使用捕获组

    • 只在需要时使用捕获组
    • 使用非捕获组 (?:...) 提高性能
    • 考虑使用命名捕获组提高可读性
  4. 模式匹配集成

    • 利用 Scala 的模式匹配特性
    • 使用 unapply 和 unapplySeq
    • 创建自定义提取器
  5. 性能考虑

    • 避免过于复杂的正则表达式
    • 考虑使用多个简单模式而不是一个复杂模式
    • 在处理大量数据时进行性能测试

正则表达式是文本处理的强大工具,在 Scala 中与模式匹配的结合使其更加优雅和强大。

本站内容仅供学习和研究使用。