Scala 正则表达式
正则表达式是处理文本和字符串匹配的强大工具。Scala 提供了丰富的正则表达式支持,包括模式匹配集成和便利的 API。
正则表达式基础
创建正则表达式
scala
import scala.util.matching.Regex
object RegexBasics {
/** Builds a few regexes in different ways, then runs the three basic
  * queries (first match, all matches, any match) against a sample sentence.
  */
def main(args: Array[String]): Unit = {
  // Three equivalent ways to obtain a Regex.
  val viaDotR: Regex = "\\d+".r // .r on a string literal
  val viaConstructor: Regex = new Regex("\\d+") // explicit constructor
  val viaRawString: Regex = """(\d{4})-(\d{2})-(\d{2})""".r // raw string, no double escaping

  // Inline flags switch matching modes.
  val ignoreCase: Regex = "(?i)hello".r
  val perLineAnchors: Regex = "(?m)^start".r
  println("正则表达式创建成功")

  val sentence = "The year 2023 has 365 days"
  val digits = "\\d+".r

  // First match, if any.
  val first = digits.findFirstIn(sentence)
  println(s"第一个数字: ${first.fold("未找到")(identity)}")

  // Every match, in order of appearance.
  val every = digits.findAllIn(sentence).toList
  println(s"所有数字: $every")

  // Does the text contain at least one match?
  val containsDigits = digits.findFirstIn(sentence).nonEmpty
  println(s"包含数字: $containsDigits")
}
}

### 基本匹配操作
scala
object BasicMatching {
/** Extracts e-mail addresses, phone numbers and dates from a block of text
  * and prints them, followed by the position of every e-mail match.
  */
def main(args: Array[String]): Unit = {
  val emailRe = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val phoneRe = """\d{3}-\d{3}-\d{4}""".r
  val dateRe = """\d{4}-\d{2}-\d{2}""".r
  val text = """
联系信息:
邮箱: john.doe@example.com, alice@company.org
电话: 555-123-4567, 555-987-6543
日期: 2023-12-25, 2024-01-01
"""

  // One output line per category: every match of the pattern, comma-separated.
  List("邮箱地址" -> emailRe, "电话号码" -> phoneRe, "日期" -> dateRe).foreach {
    case (label, re) =>
      val hits = re.findAllIn(text).toList
      println(s"$label: ${hits.mkString(", ")}")
  }

  // findAllMatchIn yields Match objects, which also carry the offsets.
  println("\n详细匹配信息:")
  for (hit <- emailRe.findAllMatchIn(text))
    println(s"邮箱: '${hit.matched}' 位置: ${hit.start}-${hit.end}")
}
}

## 捕获组
基本捕获组
scala
object CaptureGroups {
/** Demonstrates numbered capture groups, first through `Match.group(n)` and
  * then through the regex used as an extractor in a `match` expression.
  */
def main(args: Array[String]): Unit = {
  val fullName = """(\w+)\s+(\w+)""".r // first name, last name
  val email = """(\w+)@(\w+\.\w+)""".r // user, domain
  val isoDate = """(\d{4})-(\d{2})-(\d{2})""".r // year, month, day
  val sentence = "John Smith的邮箱是john@example.com,注册日期是2023-12-25"

  // Name parts; group(0) is the whole match.
  fullName.findFirstMatchIn("John Smith").fold(println("未找到名字模式")) { hit =>
    println(s"名字: ${hit.group(1)}")
    println(s"姓氏: ${hit.group(2)}")
    println(s"完整匹配: ${hit.group(0)}")
  }

  // E-mail parts found inside the longer sentence.
  email.findFirstMatchIn(sentence).fold(println("未找到邮箱")) { hit =>
    println(s"用户名: ${hit.group(1)}")
    println(s"域名: ${hit.group(2)}")
  }

  // Date parts found inside the longer sentence.
  isoDate.findFirstMatchIn(sentence).fold(println("未找到日期")) { hit =>
    println(s"年: ${hit.group(1)}")
    println(s"月: ${hit.group(2)}")
    println(s"日: ${hit.group(3)}")
  }

  // The regex as an extractor (unapplySeq): each group binds to a variable.
  val verdict = "2023-12-25" match {
    case isoDate(year, month, day) => s"模式匹配 - 年: $year, 月: $month, 日: $day"
    case _ => "日期格式不匹配"
  }
  println(verdict)
}
}

### 命名捕获组
scala
object NamedCaptureGroups {
/** Parses one log line twice: once with Java-style named groups, and once
  * with positional groups feeding a case class.
  */
def main(args: Array[String]): Unit = {
  // Java-style named capture groups (Scala 2.13+).
  val namedLog = """(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?<level>\w+)\] (?<message>.+)""".r
  val line = "2023-12-25 10:30:45 [INFO] Application started successfully"

  // Groups are read back by name instead of by index.
  namedLog.findFirstMatchIn(line).fold(println("日志格式不匹配")) { hit =>
    println(s"时间戳: ${hit.group("timestamp")}")
    println(s"级别: ${hit.group("level")}")
    println(s"消息: ${hit.group("message")}")
  }

  // Alternative: plain groups plus a case class give named, typed fields.
  case class LogEntry(timestamp: String, level: String, message: String)
  val positionalLog = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)""".r

  // Some(entry) when the whole line matches the pattern, None otherwise.
  def parseLogEntry(log: String): Option[LogEntry] =
    PartialFunction.condOpt(log) {
      case positionalLog(timestamp, level, message) => LogEntry(timestamp, level, message)
    }

  val summary = parseLogEntry(line).fold("无法解析日志")(entry => s"解析的日志: $entry")
  println(summary)
}
}

## 字符串替换
基本替换操作
scala
object StringReplacement {
/** Walks through the replacement API: replace all, replace first,
  * group references in the replacement string, and function-based replacers.
  */
def main(args: Array[String]): Unit = {
  val sentence = "The quick brown fox jumps over the lazy dog. The fox is quick."
  val fox = "fox".r

  // Replace every occurrence.
  val allCats = fox.replaceAllIn(sentence, "cat")
  println(s"替换 fox -> cat: $allCats")

  // Replace only the first occurrence.
  val firstWolf = fox.replaceFirstIn(sentence, "wolf")
  println(s"替换第一个 fox -> wolf: $firstWolf")

  // $1, $2, $3 in the replacement refer to the captured groups.
  val phoneRe = """(\d{3})-(\d{3})-(\d{4})""".r
  val phoneLine = "电话号码: 555-123-4567 和 555-987-6543"
  val prettyPhones = phoneRe.replaceAllIn(phoneLine, "($1) $2-$3")
  println(s"格式化电话: $prettyPhones")

  // A function replacer computes each replacement from its Match.
  val numberRe = """\d+""".r
  val fruitLine = "我有5个苹果和10个橙子"
  val doubled = numberRe.replaceAllIn(fruitLine, hit => (hit.matched.toInt * 2).toString)
  println(s"数字翻倍: $doubled")

  // Conditional replacement: upper-case only words longer than four characters.
  val wordRe = """\b\w+\b""".r
  val shouted = wordRe.replaceAllIn(sentence, { hit =>
    val word = hit.matched
    if (word.length > 4) word.toUpperCase else word
  })
  println(s"长单词大写: $shouted")
}
}

### 高级替换技巧
scala
object AdvancedReplacement {
/** Runs a handful of practical replacement recipes and prints each result:
  * HTML tag stripping, URL linkification, card-number masking, date
  * reordering and a multi-step clean-up pipeline.
  */
def main(args: Array[String]): Unit = {
  import scala.util.matching.Regex

  // Strip HTML tags: everything from '<' up to the next '>' is removed.
  val htmlText = "<p>这是一个<strong>重要</strong>的<em>消息</em>。</p>"
  val htmlTagPattern = """<[^>]+>""".r
  val cleanText = htmlTagPattern.replaceAllIn(htmlText, "")
  println(s"清理HTML: $cleanText")

  // Turn URLs into anchors.
  // '[' must be escaped inside the character class: java.util.regex reads a
  // bare '[' there as the start of a nested class, which would leave the outer
  // class unclosed and make the pattern throw PatternSyntaxException.
  val urlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
  val textWithUrls = "访问 https://www.example.com 或 http://blog.example.org/post"
  // The pattern admits '$' in a URL, and '$' and '\' are special in replacement
  // strings, so the generated markup is quoted before it is substituted.
  val linkedText = urlPattern.replaceAllIn(
    textWithUrls,
    m => Regex.quoteReplacement(s"<a href='${m.matched}'>${m.matched}</a>")
  )
  println(s"URL转链接: $linkedText")

  // Mask sensitive data: keep the first and last group of the card number.
  val creditCardPattern = """(\d{4})-(\d{4})-(\d{4})-(\d{4})""".r
  val sensitiveText = "信用卡号: 1234-5678-9012-3456"
  val maskedText = creditCardPattern.replaceAllIn(sensitiveText, "$1-****-****-$4")
  println(s"信用卡脱敏: $maskedText")

  // Reorder date parts: YYYY-MM-DD becomes DD/MM/YYYY.
  val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r
  val dateText = "会议日期: 2023-12-25"
  val reformattedDate = datePattern.replaceAllIn(dateText, "$3/$2/$1")
  println(s"日期格式转换: $reformattedDate")

  // Multi-step clean-up: drop punctuation, collapse whitespace runs,
  // then trim and lower-case.
  def cleanAndFormat(text: String): String = {
    val withoutPunctuation = """[^\w\s]""".r.replaceAllIn(text, "")
    val singleSpaced = """\s+""".r.replaceAllIn(withoutPunctuation, " ")
    singleSpaced.trim.toLowerCase
  }
  val messyText = " Hello, World!!! How are you??? "
  println(s"清理格式化: '${cleanAndFormat(messyText)}'")
}
}

## 模式匹配集成
### 正则表达式与模式匹配

```scala
/** Shows compiled regexes used as extractors in `match` expressions:
  * classifying free-form input and extracting validated fields from it.
  */
object RegexPatternMatching {
  // Extractor patterns. A `case Pattern(...)` succeeds only when the whole
  // input matches, and binds one variable per capture group.
  val EmailPattern = """(\w+)@(\w+\.\w+)""".r
  val PhonePattern = """(\d{3})-(\d{3})-(\d{4})""".r
  val DatePattern = """(\d{4})-(\d{2})-(\d{2})""".r
  val TimePattern = """(\d{2}):(\d{2}):(\d{2})""".r
  val IpPattern = """(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})""".r

  /** Describes what kind of value `input` looks like.
    *
    * The input is trimmed before matching. The first pattern that matches wins,
    * and unrecognised input is reported with its original (untrimmed) text.
    *
    * @param input raw text to classify
    * @return a human-readable description including the extracted parts
    */
  def classifyInput(input: String): String = input.trim match {
    case EmailPattern(user, domain) =>
      s"邮箱地址: 用户名=$user, 域名=$domain"
    case PhonePattern(area, exchange, number) =>
      s"电话号码: 区号=$area, 交换机=$exchange, 号码=$number"
    case DatePattern(year, month, day) =>
      // Braces are required here: CJK characters are valid identifier
      // characters, so `$year年` would be read as a value named `year年`.
      s"日期: ${year}年${month}月${day}日"
    case TimePattern(hour, minute, second) =>
      s"时间: $hour:$minute:$second"
    case IpPattern(a, b, c, d) =>
      s"IP地址: $a.$b.$c.$d"
    case _ =>
      s"未识别的格式: $input"
  }

  /** Extracts the fields of an e-mail address, phone number or date.
    *
    * Dates are accepted only when they name a real calendar day.
    *
    * @param input text to validate (matched as-is, without trimming)
    * @return the extracted fields plus a `"type"` entry, or `None` when the
    *         input matches no pattern or is an impossible date
    */
  def validateAndExtract(input: String): Option[Map[String, String]] = input match {
    case EmailPattern(user, domain) =>
      Some(Map("type" -> "email", "user" -> user, "domain" -> domain))
    case PhonePattern(area, exchange, number) =>
      Some(Map("type" -> "phone", "area" -> area, "exchange" -> exchange, "number" -> number))
    case DatePattern(year, month, day) if isValidDate(year.toInt, month.toInt, day.toInt) =>
      Some(Map("type" -> "date", "year" -> year, "month" -> month, "day" -> day))
    case _ => None
  }

  /** Tells whether year/month/day name a real calendar date.
    *
    * Delegates to `java.time.LocalDate.of`, so month lengths and leap years
    * are honoured (for example 2023-02-29 and 2023-04-31 are rejected).
    */
  def isValidDate(year: Int, month: Int, day: Int): Boolean =
    scala.util.Try(java.time.LocalDate.of(year, month, day)).isSuccess

  /** Runs both functions over a fixed list of sample inputs and prints the results. */
  def main(args: Array[String]): Unit = {
    val inputs = List(
      "john@example.com",
      "555-123-4567",
      "2023-12-25",
      "14:30:45",
      "192.168.1.1",
      "invalid-input",
      "2023-13-45" // well-formed but not a real date
    )
    println("输入分类:")
    inputs.foreach(input => println(s"$input -> ${classifyInput(input)}"))
    println("\n验证和提取:")
    inputs.foreach { input =>
      validateAndExtract(input) match {
        case Some(data) => println(s"$input -> 有效: $data")
        case None => println(s"$input -> 无效")
      }
    }
  }
}
```

## 实际应用示例
### 日志分析器
```scala
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
object LogAnalyzer {
// 不同类型的日志模式
val ApacheLogPattern = """(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
val ApplicationLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (\w+): (.+)""".r
val ErrorLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ERROR (.+) - (.+)""".r
case class ApacheLogEntry(
ip: String,
timestamp: String,
method: String,
url: String,
protocol: String,
status: Int,
size: Int
)
case class ApplicationLogEntry(
timestamp: String,
level: String,
logger: String,
message: String
)
case class ErrorLogEntry(
timestamp: String,
location: String,
message: String
)
/** Parses one log line into the matching entry type.
  *
  * The patterns are tried in order: Apache access log, application log,
  * error log.
  *
  * @param line a single raw log line
  * @return the parsed entry, or `None` when no pattern matches or when the
  *         Apache response size does not fit in an `Int`
  */
def parseLogLine(line: String): Option[Any] = line match {
  case ApacheLogPattern(ip, timestamp, method, url, protocol, status, size) =>
    // `status` is exactly three digits, so toInt cannot fail. `size` is an
    // unbounded digit run, so it is converted with toIntOption to avoid a
    // NumberFormatException on very large responses.
    size.toIntOption.map { bytes =>
      ApacheLogEntry(ip, timestamp, method, url, protocol, status.toInt, bytes)
    }
  case ApplicationLogPattern(timestamp, level, logger, message) =>
    Some(ApplicationLogEntry(timestamp, level, logger, message))
  case ErrorLogPattern(timestamp, location, message) =>
    Some(ErrorLogEntry(timestamp, location, message))
  case _ => None
}
/** Prints summary statistics for Apache access-log entries: the status-code
  * distribution, the five most requested URLs and the five most active IPs.
  *
  * @param logs parsed Apache entries to summarise
  */
def analyzeApacheLogs(logs: List[ApacheLogEntry]): Unit = {
  // Number of entries per distinct value of `key`.
  def tally[K](key: ApacheLogEntry => K): Map[K, Int] =
    logs.groupBy(key).view.mapValues(_.size).toMap

  // Prints a heading and then the five highest counts, largest first.
  def printTop(heading: String, counts: Map[String, Int]): Unit = {
    val top = counts.toSeq.sortBy(-_._2).take(5)
    println(heading)
    for ((label, hits) <- top) println(s" $label: $hits 次")
  }

  println("Apache 日志分析:")
  println(s"状态码分布: ${tally(_.status)}")
  printTop("最常访问的URL:", tally(_.url))
  printTop("最活跃的IP:", tally(_.ip))
}
/** Parses a small mixed sample of log lines, prints every parsed entry and
  * then analyses the Apache entries among them.
  */
def main(args: Array[String]): Unit = {
  val sampleLogs = List(
    """192.168.1.1 - - [25/Dec/2023:10:00:00 +0000] "GET /index.html HTTP/1.1" 200 1234""",
    """192.168.1.2 - - [25/Dec/2023:10:01:00 +0000] "POST /api/users HTTP/1.1" 201 567""",
    """2023-12-25 10:00:00 [INFO] UserService: User login successful""",
    """2023-12-25 10:01:00 ERROR DatabaseConnection - Connection timeout""",
    """192.168.1.1 - - [25/Dec/2023:10:02:00 +0000] "GET /about.html HTTP/1.1" 200 890"""
  )

  // Lines that match no pattern are dropped.
  val entries = for {
    line <- sampleLogs
    entry <- parseLogLine(line)
  } yield entry

  println("解析的日志条目:")
  for (entry <- entries) println(entry)

  // Only the Apache entries go into the statistics.
  val apacheOnly = entries.collect { case apache: ApacheLogEntry => apache }
  if (apacheOnly.nonEmpty) analyzeApacheLogs(apacheOnly)
}
}
```

### 文本处理工具
scala
object TextProcessor {
// Patterns for the different kinds of tokens extracted from free text.
// In UrlPattern the '[' inside the character class is escaped: java.util.regex
// treats a bare '[' there as the start of a nested class, which would leave the
// outer class unclosed and make the pattern fail to compile at initialisation.
val UrlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
val HashtagPattern = """#\w+""".r
val MentionPattern = """@\w+""".r
val PhonePattern = """\b\d{3}-\d{3}-\d{4}\b""".r
case class TextAnalysis(
urls: List[String],
emails: List[String],
hashtags: List[String],
mentions: List[String],
phones: List[String],
wordCount: Int,
characterCount: Int
)
/** Collects every URL, e-mail address, hashtag, mention and phone number
  * found in `text`, together with its word and character counts.
  *
  * @param text the text to scan
  * @return the matches of each pattern in order of appearance, plus the counts
  */
def analyzeText(text: String): TextAnalysis = {
  // All matches of one pattern, in order of appearance.
  def matchesOf(pattern: scala.util.matching.Regex): List[String] =
    pattern.findAllIn(text).toList

  // A word is a maximal run of word characters.
  val words = """\b\w+\b""".r.findAllIn(text).size

  TextAnalysis(
    matchesOf(UrlPattern),
    matchesOf(EmailPattern),
    matchesOf(HashtagPattern),
    matchesOf(MentionPattern),
    matchesOf(PhonePattern),
    words,
    text.length
  )
}
/** Returns the contents of every double-quoted, non-empty span in `text`,
  * without the surrounding quotes, in order of appearance.
  */
def extractQuotes(text: String): List[String] = {
  val quoted = "\"([^\"]+)\"".r
  (for (hit <- quoted.findAllMatchIn(text)) yield hit.group(1)).toList
}
/** Wraps every whole-word, case-insensitive occurrence of each keyword in
  * `**…**`, keeping the casing found in the text.
  *
  * Keywords are matched literally: regex metacharacters in a keyword have no
  * special meaning. Empty keywords are ignored.
  *
  * @param text     the text to annotate
  * @param keywords the words to highlight, applied one after another
  * @return the text with each occurrence surrounded by double asterisks
  */
def highlightKeywords(text: String, keywords: List[String]): String = {
  import scala.util.matching.Regex
  keywords.filter(_.nonEmpty).foldLeft(text) { (result, keyword) =>
    // Regex.quote stops characters such as '.' or '+' in the keyword from
    // being interpreted as regex syntax.
    val pattern = s"(?i)\\b${Regex.quote(keyword)}\\b".r
    // quoteReplacement keeps '$' and '\' in the matched text literal.
    pattern.replaceAllIn(result, m => Regex.quoteReplacement(s"**${m.matched}**"))
  }
}
/** Replaces every whole-word, case-insensitive occurrence of each listed
  * word with asterisks, one per character of the listed word.
  *
  * Words are matched literally: regex metacharacters in a word have no
  * special meaning.
  *
  * @param text          the text to censor
  * @param profanityList the words to mask, applied one after another
  * @return the censored text
  */
def censorProfanity(text: String, profanityList: List[String]): String = {
  import scala.util.matching.Regex
  profanityList.foldLeft(text) { (result, word) =>
    // Regex.quote stops characters such as '.' in the word from acting as
    // regex syntax and masking unrelated words.
    val pattern = s"(?i)\\b${Regex.quote(word)}\\b".r
    pattern.replaceAllIn(result, "*" * word.length)
  }
}
/** Returns the body of every fenced code block (```lang … ```) in `text`,
  * without the fences or the optional language tag.
  *
  * @param text Markdown-like text to scan
  * @return the code block bodies in order of appearance
  */
def extractCodeBlocks(text: String): List[String] = {
  // (?s) lets '.' match line terminators. Without it a body spanning more
  // than one line never matches.
  val codeBlockPattern = """(?s)```(\w+)?\n(.*?)\n```""".r
  codeBlockPattern.findAllMatchIn(text).map(_.group(2)).toList
}
/** Converts a small Markdown subset to HTML: bold, italics, links and
  * inline code, applied in that order.
  *
  * @param text Markdown source
  * @return the text with the supported constructs rewritten as HTML tags
  */
def formatMarkdown(text: String): String = {
  // Order matters: bold (**x**) must be rewritten before italics (*x*).
  val rewrites = List(
    """\*\*([^*]+)\*\*""" -> "<strong>$1</strong>", // bold
    """\*([^*]+)\*""" -> "<em>$1</em>", // italics
    """\[([^\]]+)\]\(([^)]+)\)""" -> "<a href='$2'>$1</a>", // link
    """`([^`]+)`""" -> "<code>$1</code>" // inline code
  )
  rewrites.foldLeft(text) { case (soFar, (regex, html)) =>
    regex.r.replaceAllIn(soFar, html)
  }
}
/** Runs the analysis, quote extraction, keyword highlighting and Markdown
  * formatting helpers over one sample text and prints the results.
  */
def main(args: Array[String]): Unit = {
  val sampleText = """
查看我的网站 https://www.example.com 或发邮件到 contact@example.com
关注我 @username 并使用标签 #scala #programming
电话: 555-123-4567
"这是一个引用的文本"
**粗体文本** 和 *斜体文本*
[链接文本](https://link.com)
`代码片段`
""".trim

  // Prints one "label: value" line.
  def show(label: String, value: Any): Unit = println(s"$label: $value")

  // Text analysis.
  val report = analyzeText(sampleText)
  println("文本分析结果:")
  show("URLs", report.urls)
  show("邮箱", report.emails)
  show("标签", report.hashtags)
  show("提及", report.mentions)
  show("电话", report.phones)
  show("单词数", report.wordCount)
  show("字符数", report.characterCount)

  // Quoted spans.
  val quoted = extractQuotes(sampleText)
  println(s"\n引用内容: $quoted")

  // Keyword highlighting.
  val marked = highlightKeywords(sampleText, List("scala", "programming"))
  println(s"\n关键词高亮:\n$marked")

  // Markdown to HTML.
  val html = formatMarkdown(sampleText)
  println(s"\nMarkdown 格式化:\n$html")
}
}

### 数据验证器
scala
object DataValidator {
// Whole-value validation patterns, each anchored with ^ and $.
// In UrlPattern the '[' inside the character class is escaped: java.util.regex
// treats a bare '[' there as the start of a nested class, which would leave the
// outer class unclosed and make the pattern fail to compile at initialisation.
val EmailPattern = """^[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}$""".r
val PhonePattern = """^\d{3}-\d{3}-\d{4}$""".r
val ZipCodePattern = """^\d{5}(-\d{4})?$""".r
val CreditCardPattern = """^\d{4}-\d{4}-\d{4}-\d{4}$""".r
// Look-aheads require a lower-case letter, an upper-case letter, a digit and
// one of @$!%*?&; the final class restricts the alphabet and minimum length.
val PasswordPattern = """^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$""".r
val UsernamePattern = """^[a-zA-Z0-9_]{3,20}$""".r
val UrlPattern = """^https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?$""".r
sealed trait ValidationResult
case object Valid extends ValidationResult
case class Invalid(message: String) extends ValidationResult
case class ValidationRule(name: String, pattern: Regex, errorMessage: String)
val validationRules = Map(
"email" -> ValidationRule("邮箱", EmailPattern, "邮箱格式无效"),
"phone" -> ValidationRule("电话", PhonePattern, "电话格式应为 XXX-XXX-XXXX"),
"zipcode" -> ValidationRule("邮编", ZipCodePattern, "邮编格式无效"),
"creditcard" -> ValidationRule("信用卡", CreditCardPattern, "信用卡格式应为 XXXX-XXXX-XXXX-XXXX"),
"password" -> ValidationRule("密码", PasswordPattern, "密码必须包含大小写字母、数字和特殊字符,至少8位"),
"username" -> ValidationRule("用户名", UsernamePattern, "用户名只能包含字母、数字和下划线,3-20位"),
"url" -> ValidationRule("URL", UrlPattern, "URL格式无效")
)
/** Validates `value` against the rule registered for `fieldType`.
  *
  * @param fieldType key into `validationRules`
  * @param value     the text to check
  * @return `Valid` when the value matches the rule's pattern; otherwise
  *         `Invalid` with the rule's error message, or with an "unknown
  *         field type" message when no rule is registered
  */
def validate(fieldType: String, value: String): ValidationResult =
  validationRules
    .get(fieldType)
    .fold[ValidationResult](Invalid(s"未知的字段类型: $fieldType")) { rule =>
      if (rule.pattern.matches(value)) Valid else Invalid(rule.errorMessage)
    }
/** Validates every entry of `data`, treating each key as the field type of
  * its value.
  *
  * @param data field type -> value to check
  * @return field type -> validation result
  */
def validateMultiple(data: Map[String, String]): Map[String, ValidationResult] =
  for ((fieldType, value) <- data) yield fieldType -> validate(fieldType, value)
// 自定义验证器
/** Checks that `ageStr` is a non-negative whole number between 0 and 150.
  *
  * @param ageStr the text to check
  * @return `Valid` for digit strings in range; `Invalid` with a range message
  *         for digit strings out of range (including ones too large for an
  *         `Int`), or with a "must be a number" message for anything else
  */
def validateAge(ageStr: String): ValidationResult = {
  val agePattern = """^\d+$""".r
  ageStr match {
    case agePattern() =>
      // toIntOption returns None for digit runs beyond Int range, which are
      // out of range by definition, instead of throwing NumberFormatException.
      ageStr.toIntOption match {
        case Some(age) if age >= 0 && age <= 150 => Valid
        case _ => Invalid("年龄必须在0-150之间")
      }
    case _ =>
      Invalid("年龄必须是数字")
  }
}
/** Checks that `dateStr` is a real calendar date in `YYYY-MM-DD` form with a
  * year between 1900 and 2100.
  *
  * @param dateStr the text to check
  * @return `Valid` for an existing date in the supported year range;
  *         `Invalid` with a range message for well-formed but impossible or
  *         out-of-range dates, or with a format message otherwise
  */
def validateDate(dateStr: String): ValidationResult = {
  val datePattern = """^(\d{4})-(\d{2})-(\d{2})$""".r
  dateStr match {
    case datePattern(year, month, day) =>
      val y = year.toInt
      val inSupportedYears = y >= 1900 && y <= 2100
      // LocalDate.of rejects months outside 1-12 and days that do not exist
      // in the given month, so leap years are handled too.
      val isRealDate =
        scala.util.Try(java.time.LocalDate.of(y, month.toInt, day.toInt)).isSuccess
      if (inSupportedYears && isRealDate) Valid
      else Invalid("日期值超出有效范围")
    case _ =>
      Invalid("日期格式应为 YYYY-MM-DD")
  }
}
/** Validates one map of well-formed values and one of malformed values,
  * then exercises the custom age and date validators, printing every result.
  */
def main(args: Array[String]): Unit = {
  // Prints a heading followed by one "field: result" line per entry.
  def report(heading: String, samples: Map[String, String]): Unit = {
    println(heading)
    for ((field, outcome) <- validateMultiple(samples)) println(s"$field: $outcome")
  }

  val wellFormed = Map(
    "email" -> "user@example.com",
    "phone" -> "555-123-4567",
    "zipcode" -> "12345",
    "creditcard" -> "1234-5678-9012-3456",
    "password" -> "MyPass123!",
    "username" -> "user_123",
    "url" -> "https://www.example.com"
  )
  val malformed = Map(
    "email" -> "invalid-email",
    "phone" -> "123456789",
    "zipcode" -> "abc",
    "password" -> "weak"
  )

  report("有效数据验证:", wellFormed)
  report("\n无效数据验证:", malformed)

  println("\n自定义验证:")
  for (age <- List("25", "200"))
    println(s"年龄 '$age': ${validateAge(age)}")
  for (date <- List("2023-12-25", "2023-13-45"))
    println(s"日期 '$date': ${validateDate(date)}")
}
}

## 性能优化
编译和缓存
scala
object RegexPerformance {
// 预编译正则表达式
val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
val PhonePattern = """\d{3}-\d{3}-\d{4}""".r
/** Keeps the texts that contain an e-mail address.
  *
  * Deliberately slow counter-example: the pattern is compiled again for
  * every element instead of being compiled once up front.
  */
def inefficientMatching(texts: List[String]): List[String] =
  for {
    text <- texts
    // Recompiled on each iteration on purpose; this is the anti-pattern.
    if """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r.findFirstIn(text).nonEmpty
  } yield text
/** Keeps the texts that contain an e-mail address, reusing the precompiled
  * `EmailPattern` for every element.
  */
def efficientMatching(texts: List[String]): List[String] =
  for {
    text <- texts
    if EmailPattern.findFirstIn(text).nonEmpty
  } yield text
/** Sorts texts into three buckets, preserving input order within each.
  *
  * A text containing an e-mail address goes to `"emails"`; otherwise one
  * containing a phone number goes to `"phones"`; everything else goes to
  * `"others"`.
  *
  * @param texts the texts to classify
  * @return a map with exactly the keys `"emails"`, `"phones"` and `"others"`
  */
def batchProcess(texts: List[String]): Map[String, List[String]] = {
  val (withEmail, withoutEmail) = texts.partition(EmailPattern.findFirstIn(_).isDefined)
  // The phone check runs only on texts that had no e-mail address.
  val (withPhone, remainder) = withoutEmail.partition(PhonePattern.findFirstIn(_).isDefined)
  Map(
    "emails" -> withEmail,
    "phones" -> withPhone,
    "others" -> remainder
  )
}
/** Times the three matching strategies on generated sample data and prints
  * the elapsed milliseconds of each.
  */
def benchmarkRegex(): Unit = {
  // Runs `body` once, prints its wall-clock time in ms, returns its result.
  def timed[T](label: String)(body: => T): T = {
    val startedAt = System.nanoTime()
    val outcome = body
    val finishedAt = System.nanoTime()
    println(f"$label%20s: ${(finishedAt - startedAt) / 1000000}%6d ms")
    outcome
  }

  // 10000 copies of each sample, grouped by kind.
  val corpus = List("user@example.com", "555-123-4567", "random text")
    .flatMap(sample => List.fill(10000)(sample))

  println("正则表达式性能测试:")
  timed("低效匹配")(inefficientMatching(corpus.take(1000)))
  timed("高效匹配")(efficientMatching(corpus.take(1000)))
  timed("批量处理")(batchProcess(corpus))
}
/** Entry point: runs the regex benchmark. */
def main(args: Array[String]): Unit = benchmarkRegex()
}

## 最佳实践
预编译正则表达式:
- 避免在循环中重复编译
- 使用 `.r` 方法或 `new Regex()`
- 将常用模式定义为常量
使用原始字符串:
- 使用三重引号避免转义
- 提高可读性和维护性
- 减少转义字符的复杂性
合理使用捕获组:
- 只在需要时使用捕获组
- 使用非捕获组 `(?:...)` 提高性能
- 考虑使用命名捕获组提高可读性
模式匹配集成:
- 利用 Scala 的模式匹配特性
- 使用 `unapply` 和 `unapplySeq`
- 创建自定义提取器
性能考虑:
- 避免过于复杂的正则表达式
- 考虑使用多个简单模式而不是一个复杂模式
- 在处理大量数据时进行性能测试
正则表达式是文本处理的强大工具,在 Scala 中与模式匹配的结合使其更加优雅和强大。