Skip to content

正则表达式

概述

正则表达式是处理文本的强大工具,用于模式匹配、文本搜索、替换和验证。Kotlin提供了完整的正则表达式支持:在JVM平台上,它基于Java的Pattern和Matcher类实现,同时通过Regex类及相关扩展函数提供了更简洁的Kotlin风格API。

正则表达式基础

创建正则表达式

kotlin
/**
 * Shows the different ways of building a [Regex] (constructor, `toRegex()`,
 * raw strings, options) and runs three containment checks on a sample string.
 */
fun main() {
    println("=== 正则表达式创建 ===")

    // 1. Regex constructor with ordinary string literals (backslashes doubled).
    val literalHello = Regex("hello")
    val digitRun = Regex("[0-9]+")
    val dashedPhone = Regex("\\d{3}-\\d{3}-\\d{4}")  // phone number layout

    // 2. The String.toRegex() extension.
    val literalWorld = "world".toRegex()
    val letterRun = "[a-zA-Z]+".toRegex()

    // 3. Raw strings, which need no extra escaping.
    val isoDate = Regex("""^\d{4}-\d{2}-\d{2}$""")  // date layout
    val emailAddress = Regex("""[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}""")  // e-mail

    // 4. Regexes built with options. COMMENTS lets the pattern carry
    //    whitespace and '#' comments for readability.
    val helloAnyCase = Regex("HELLO", RegexOption.IGNORE_CASE)
    val commentedPhone = Regex("""
        \d{3}     # 区号
        -         # 分隔符
        \d{3}     # 前三位
        -         # 分隔符  
        \d{4}     # 后四位
    """.trimIndent(), setOf(RegexOption.COMMENTS, RegexOption.IGNORE_CASE))

    println("正则表达式创建完成")

    // Basic containment checks against one sample.
    val sample = "Hello World 123"
    println("测试字符串: '$sample'")

    val hasHello = helloAnyCase.containsMatchIn(sample)
    val hasDigits = digitRun.containsMatchIn(sample)
    val hasLetters = letterRun.containsMatchIn(sample)
    println("包含'hello'(忽略大小写): $hasHello")
    println("包含数字: $hasDigits")
    println("包含字母: $hasLetters")
}

基本匹配操作

kotlin
/**
 * Walks through the basic matching operations of [Regex]: containment checks,
 * first match, all matches, whole-string matching and mapping matches to values.
 */
fun main() {
    println("=== 基本匹配操作 ===")

    val text = "联系电话:138-1234-5678,邮箱:user@example.com,日期:2023-12-25"

    // 1. Does the text contain a match at all?
    val phonePattern = """\d{3}-\d{4}-\d{4}""".toRegex()
    val emailPattern = """[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}""".toRegex()
    val datePattern = """\d{4}-\d{2}-\d{2}""".toRegex()

    println("文本: $text")
    println("包含电话号码: ${phonePattern.containsMatchIn(text)}")
    println("包含邮箱: ${emailPattern.containsMatchIn(text)}")
    println("包含日期: ${datePattern.containsMatchIn(text)}")

    // 2. First match only; find() returns null when nothing matches.
    val firstPhone = phonePattern.find(text)
    if (firstPhone != null) {
        println("找到电话号码: ${firstPhone.value}")
        println("位置: ${firstPhone.range}")
    }

    val firstEmail = emailPattern.find(text)
    if (firstEmail != null) {
        val span = firstEmail.range
        println("找到邮箱: ${firstEmail.value}")
        println("开始位置: ${span.first}")
        println("结束位置: ${span.last}")
    }

    // 3. Every match, in order of appearance.
    val digitPattern = """\d+""".toRegex()

    println("所有数字:")
    for (hit in digitPattern.findAll(text)) {
        println("  ${hit.value} at ${hit.range}")
    }

    // 4. Whole-string match: matches() succeeds only if the entire input fits.
    val phoneNumber = "138-1234-5678"
    val wholePhonePattern = """^\d{3}-\d{4}-\d{4}$""".toRegex()

    val fullOk = wholePhonePattern.matches(phoneNumber)
    val partialOk = wholePhonePattern.matches("138-1234")
    println("'$phoneNumber' 是完整的电话号码: $fullOk")
    println("'138-1234' 是完整的电话号码: $partialOk")

    // 5. Turn the matches into typed values.
    val numbers = digitPattern.findAll(text).mapTo(mutableListOf()) { it.value.toInt() }

    println("提取的数字: $numbers")
}

正则表达式模式

字符类和量词

kotlin
/**
 * Prints, for a list of sample strings, what each character-class pattern
 * finds in them, then shows how each quantifier behaves on runs of 'a'.
 */
fun main() {
    println("=== 字符类和量词 ===")

    val samples = listOf(
        "abc123",
        "ABC",
        "123",
        "hello@world.com",
        "test_file.txt",
        "special-chars!@#",
        "   spaces   ",
        "newline\nhere",
        "tab\there"
    )

    // Character-class patterns, listed in the order they are reported.
    val namedPatterns = listOf(
        "数字" to """\d+""".toRegex(),              // one or more digits
        "字母" to """[a-zA-Z]+""".toRegex(),        // one or more ASCII letters
        "字母数字" to """\w+""".toRegex(),           // one or more word characters
        "小写字母" to """[a-z]+""".toRegex(),        // one or more lowercase letters
        "大写字母" to """[A-Z]+""".toRegex(),        // one or more uppercase letters
        "特殊字符" to """[!@#$%^&*()]+""".toRegex(), // punctuation from a fixed set
        "空白字符" to """\s+""".toRegex(),           // whitespace
        "非数字" to """\D+""".toRegex(),             // anything but digits
        "非单词字符" to """\W+""".toRegex(),         // anything but word characters
        "邮箱格式" to """\w+@\w+\.\w+""".toRegex()   // simplistic e-mail shape
    )

    for (sample in samples) {
        println("测试字符串: '$sample'")
        for ((label, pattern) in namedPatterns) {
            val found = pattern.findAll(sample).map { it.value }.toList()
            // Patterns that find nothing are simply not reported.
            if (found.isEmpty()) continue
            println("  $label: $found")
        }
        println()
    }

    // Quantifier examples: each pattern is anchored so it must cover the whole input.
    println("=== 量词示例 ===")
    val quantifierCases = listOf(
        "a?" to listOf("", "a", "aa", "aaa"),             // zero or one 'a'
        "a*" to listOf("", "a", "aa", "aaa"),             // zero or more
        "a+" to listOf("", "a", "aa", "aaa"),             // one or more
        "a{2}" to listOf("a", "aa", "aaa"),               // exactly two
        "a{2,}" to listOf("a", "aa", "aaa", "aaaa"),      // two or more
        "a{1,3}" to listOf("", "a", "aa", "aaa", "aaaa")  // between one and three
    )

    for ((pattern, inputs) in quantifierCases) {
        println("模式: $pattern")
        val anchored = Regex("^$pattern$")
        for (input in inputs) {
            val accepted = anchored.matches(input)
            println("  '$input': $accepted")
        }
        println()
    }
}

分组和捕获

kotlin
/**
 * Demonstrates capturing groups: positional groups, named groups, optional
 * groups and non-capturing groups, applied to dates, phone numbers, e-mail
 * addresses and a URL.
 */
fun main() {
    println("=== 分组和捕获 ===")

    // 1. Positional groups: group 0 is the whole match, groups 1..3 are
    //    year / month / day.
    val dateText = "今天是2023年12月25日,明天是2023年12月26日"
    val dateRegex = Regex("""(\d{4})年(\d{1,2})月(\d{1,2})日""")

    println("文本: $dateText")
    println("查找所有日期:")

    dateRegex.findAll(dateText).forEach { match ->
        println("完整匹配: ${match.value}")
        println("年份: ${match.groupValues[1]}")
        println("月份: ${match.groupValues[2]}")
        println("日期: ${match.groupValues[3]}")
        println("所有分组: ${match.groupValues}")
        println()
    }

    // 2. Named groups: the same pattern, with groups addressed by name.
    val namedDateRegex = Regex("""(?<year>\d{4})年(?<month>\d{1,2})月(?<day>\d{1,2})日""")

    println("使用命名分组:")
    namedDateRegex.find(dateText)?.let { match ->
        println("年份: ${match.groups["year"]?.value}")
        println("月份: ${match.groups["month"]?.value}")
        println("日期: ${match.groups["day"]?.value}")
    }

    // 3. Phone number parsing.
    //    Group 1: optional country code, group 2: area code / prefix,
    //    group 3: subscriber number. The subscriber number may itself carry
    //    one more "-dddd" segment (as in 138-1234-5678); without the optional
    //    tail the match would stop at "+86-138-1234" and drop "-5678".
    val phoneText = "联系方式:+86-138-1234-5678 或 021-12345678"
    val phoneRegex = Regex("""(?:(\+\d{1,3})-)?(\d{3,4})-(\d{4,8}(?:-\d{4})?)""")

    println("电话号码解析:")
    phoneRegex.findAll(phoneText).forEach { match ->
        // An optional group that did not participate yields an empty string.
        val countryCode = match.groupValues[1].takeIf { it.isNotEmpty() } ?: "无"
        val areaCode = match.groupValues[2]
        val number = match.groupValues[3]

        println("完整号码: ${match.value}")
        println("国家代码: $countryCode")
        println("区号: $areaCode")
        println("号码: $number")
        println()
    }

    // 4. E-mail parsing: user name, domain and top-level domain.
    val emailText = "联系邮箱:john.doe@company.com 和 admin@test.org"
    val emailRegex = Regex("""([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})""")

    println("邮箱地址解析:")
    emailRegex.findAll(emailText).forEach { match ->
        println("完整邮箱: ${match.value}")
        println("用户名: ${match.groupValues[1]}")
        println("域名: ${match.groupValues[2]}")
        println("顶级域名: ${match.groupValues[3]}")
        println()
    }

    // 5. URL parsing: scheme, host, optional port, path, optional query,
    //    optional fragment. The (?: ... ) wrappers group without capturing.
    val urlText = "访问 https://www.example.com:8080/path/to/page?param=value#section"
    val urlRegex = Regex("""(https?)://([^:/]+)(?::(\d+))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?""")

    println("URL解析:")
    urlRegex.find(urlText)?.let { match ->
        println("完整URL: ${match.value}")
        println("协议: ${match.groupValues[1]}")
        println("主机: ${match.groupValues[2]}")
        println("端口: ${match.groupValues[3].takeIf { it.isNotEmpty() } ?: "默认"}")
        println("路径: ${match.groupValues[4]}")
        println("查询参数: ${match.groupValues[5]}")
        println("锚点: ${match.groupValues[6]}")
    }
}

文本替换和处理

基本替换操作

kotlin
/**
 * Demonstrates regex-based replacement: fixed replacement text, group
 * references, replacement lambdas and whitespace normalisation.
 */
fun main() {
    println("=== 文本替换操作 ===")

    val originalText = """
        用户信息:
        姓名:张三
        电话:138-1234-5678
        邮箱:zhangsan@example.com
        生日:1990-05-15
        地址:北京市朝阳区某某街道123号
    """.trimIndent()

    println("原始文本:")
    println(originalText)
    println()

    // 1. Plain replacement: every phone number becomes a fixed mask.
    val phonePattern = Regex("""\d{3}-\d{4}-\d{4}""")
    val hiddenPhoneText = phonePattern.replace(originalText, "***-****-****")
    println("隐藏电话号码:")
    println(hiddenPhoneText)
    println()

    // 2. Group references: $1, $2, $3 in the replacement are the captured groups.
    val isoDatePattern = Regex("""(\d{4})-(\d{2})-(\d{2})""")
    val formattedDateText = isoDatePattern.replace(originalText, "$1年$2月$3日")
    println("格式化日期:")
    println(formattedDateText)
    println()

    // 3. Replacement lambda: keep the first two characters of the user name
    //    and star out the rest.
    val emailPattern = Regex("""([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})""")
    val maskedEmailText = emailPattern.replace(originalText) { hit ->
        val (username, domain) = hit.destructured
        val hiddenCount = (username.length - 2).coerceAtLeast(0)
        val maskedUsername = username.take(2) + "*".repeat(hiddenCount)
        "$maskedUsername@$domain"
    }
    println("掩码邮箱:")
    println(maskedEmailText)
    println()

    // 4. Reformatting: insert dashes into bare 11-digit numbers, dropping an
    //    optional +86 prefix.
    val phoneFormatText = "电话号码:13812345678, 02112345678, +8613812345678"
    val barePhonePattern = Regex("""(?:\+86)?(\d{3})(\d{4})(\d{4})""")
    val formattedPhones = barePhonePattern.replace(phoneFormatText) { hit ->
        val (head, middle, tail) = hit.destructured
        "$head-$middle-$tail"
    }
    println("格式化电话号码:")
    println("原文: $phoneFormatText")
    println("格式化后: $formattedPhones")
    println()

    // 5. Clean-up: collapse every whitespace run to one space, then trim the ends.
    val messyText = "  Hello    World!  \n\n  How   are    you?  \t\t  "
    val whitespaceRun = Regex("""\s+""")
    val cleanedText = whitespaceRun.replace(messyText, " ").trim()

    println("文本清理:")
    println("原文: '$messyText'")
    println("清理后: '$cleanedText'")
}

高级替换技巧

kotlin
/**
 * Demonstrates more involved replacements: per-match conditional rewriting,
 * a chain of formatting rules, template variable substitution and masking of
 * sensitive numbers.
 */
fun main() {
    println("=== 高级替换技巧 ===")

    // 1. Conditional replacement: convert each price to CNY depending on its
    //    currency symbol.
    val priceText = "商品价格:$100.50, ¥200.30, €150.75, £80.25"

    val convertedPrices = priceText.replace(Regex("""([¥€£$])(\d+\.?\d*)""")) { match ->
        val currency = match.groupValues[1]
        val amount = match.groupValues[2].toDouble()

        val convertedAmount = when (currency) {
            "$" -> amount * 7.2  // USD -> CNY
            "€" -> amount * 7.8  // EUR -> CNY
            "£" -> amount * 9.1  // GBP -> CNY
            "¥" -> amount        // already CNY
            else -> amount
        }

        "¥${"%.2f".format(convertedAmount)}"
    }

    println("货币转换:")
    println("原文: $priceText")
    println("转换后: $convertedPrices")
    println()

    // 2. Text formatting with a chain of rules.
    val codeText = """
        function calculateSum(a,b){
        return a+b;
        }
        
        var result=calculateSum(10,20);
        console.log(result);
    """.trimIndent()

    // Each rule is applied to the output of the previous one, so order matters.
    val formattedCode = codeText
        // name(a,b) -> name(a, b)
        .replace(Regex("""(\w+)\s*\(\s*(\w+)\s*,\s*(\w+)\s*\)""")) { match ->
            "${match.groupValues[1]}(${match.groupValues[2]}, ${match.groupValues[3]})"
        }
        // x=y -> x = y
        .replace(Regex("""(\w+)\s*=\s*(\w+)""")) { match ->
            "${match.groupValues[1]} = ${match.groupValues[2]}"
        }
        // opening brace: space before, newline and indent after
        .replace(Regex("""\{\s*"""), " {\n    ")
        // closing brace on its own line
        .replace(Regex("""\s*\}"""), "\n}")
        // a statement directly following ';' moves to a new line
        .replace(Regex(""";(\w)"""), ";\n$1")

    println("代码格式化:")
    println("原代码:")
    println(codeText)
    println("\n格式化后:")
    println(formattedCode)
    println()

    // 3. Template substitution: {{name}} placeholders are looked up in a map.
    val template = "Hello {{name}}, your order #{{orderNumber}} for {{amount}} has been {{status}}."
    val variables = mapOf(
        "name" to "张三",
        "orderNumber" to "12345",
        "amount" to "$99.99",
        "status" to "confirmed"
    )

    val processedTemplate = template.replace(Regex("""\{\{(\w+)\}\}""")) { match ->
        val variableName = match.groupValues[1]
        // Unknown placeholders are left untouched.
        variables[variableName] ?: match.value
    }

    println("模板替换:")
    println("模板: $template")
    println("处理后: $processedTemplate")
    println()

    // 4. Masking sensitive numbers.
    val sensitiveText = """
        身份证号:110101199001011234
        银行卡号:6222021234567890123
        手机号:13812345678
    """.trimIndent()

    // Every rule is bounded by (?<!\d) ... (?!\d) so it only fires on a digit
    // run of exactly the intended length. Without the boundaries the 18-digit
    // ID rule would also consume the first 18 digits of the 19-digit card
    // number and the card rule would never apply.
    val desensitizedText = sensitiveText
        // ID card: 18 characters, the last one may be X
        .replace(Regex("""(?<!\d)(\d{6})\d{8}(\d{3}[\dXx])(?!\d)""")) { match ->
            "${match.groupValues[1]}********${match.groupValues[2]}"
        }
        // bank card: 19 digits
        .replace(Regex("""(?<!\d)(\d{4})\d{11}(\d{4})(?!\d)""")) { match ->
            "${match.groupValues[1]}***********${match.groupValues[2]}"
        }
        // mobile number: 11 digits
        .replace(Regex("""(?<!\d)(\d{3})\d{4}(\d{4})(?!\d)""")) { match ->
            "${match.groupValues[1]}****${match.groupValues[2]}"
        }

    println("数据脱敏:")
    println("原文:")
    println(sensitiveText)
    println("\n脱敏后:")
    println(desensitizedText)
}

数据验证

常用验证模式

kotlin
/**
 * Collection of regex-based validators for common input formats.
 *
 * Every `validateXxx` function returns [ValidationResult.Success] when the
 * input is accepted and [ValidationResult.Error] with a user-facing message
 * otherwise; none of them throws for malformed input.
 */
class DataValidator {

    companion object {
        // Commonly used patterns. All are anchored and meant for whole-string matching.
        val EMAIL_PATTERN = Regex("""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$""")
        val PHONE_PATTERN = Regex("""^1[3-9]\d{9}$""")  // mainland China mobile number
        val ID_CARD_PATTERN = Regex("""^\d{17}[\dXx]$""")  // 18-character resident ID
        val PASSWORD_PATTERN = Regex("""^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$""")
        val URL_PATTERN = Regex("""^https?://[^\s/$.?#].[^\s]*$""")
        val IPV4_PATTERN = Regex("""^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$""")
        val DATE_PATTERN = Regex("""^\d{4}-\d{2}-\d{2}$""")
        val TIME_PATTERN = Regex("""^([01]?[0-9]|2[0-3]):[0-5][0-9]$""")
        val CREDIT_CARD_PATTERN = Regex("""^\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}$""")

        // Per-rule password checks, compiled once instead of on every call.
        private val HAS_LOWERCASE = Regex("[a-z]")
        private val HAS_UPPERCASE = Regex("[A-Z]")
        private val HAS_DIGIT = Regex("\\d")
        private val HAS_SPECIAL = Regex("[@$!%*?&]")

        // Weights applied to the first 17 ID digits, and the check character
        // selected by (weighted sum mod 11).
        private val ID_WEIGHTS = intArrayOf(7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
        private val ID_CHECK_CODES = charArrayOf('1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2')
    }

    /** Accepts [email] when it matches [EMAIL_PATTERN] in full. */
    fun validateEmail(email: String): ValidationResult =
        matchOrError(EMAIL_PATTERN, email, "邮箱格式不正确")

    /** Accepts [phone] when it matches [PHONE_PATTERN] in full. */
    fun validatePhone(phone: String): ValidationResult =
        matchOrError(PHONE_PATTERN, phone, "手机号格式不正确")

    /**
     * Checks [password] rule by rule and reports the first rule that fails:
     * minimum length 8, a lowercase letter, an uppercase letter, a digit and
     * one of the special characters `@$!%*?&`.
     */
    fun validatePassword(password: String): ValidationResult {
        return when {
            password.length < 8 -> ValidationResult.Error("密码长度至少8位")
            !HAS_LOWERCASE.containsMatchIn(password) -> ValidationResult.Error("密码必须包含小写字母")
            !HAS_UPPERCASE.containsMatchIn(password) -> ValidationResult.Error("密码必须包含大写字母")
            !HAS_DIGIT.containsMatchIn(password) -> ValidationResult.Error("密码必须包含数字")
            !HAS_SPECIAL.containsMatchIn(password) -> ValidationResult.Error("密码必须包含特殊字符")
            else -> ValidationResult.Success
        }
    }

    /**
     * Validates the shape of [idCard] against [ID_CARD_PATTERN] and then its
     * check character: the first 17 digits are weighted and summed, and the
     * sum modulo 11 selects the expected last character (compared
     * case-insensitively, so both `x` and `X` are accepted).
     */
    fun validateIdCard(idCard: String): ValidationResult {
        if (!ID_CARD_PATTERN.matches(idCard)) {
            return ValidationResult.Error("身份证号格式不正确")
        }

        // Safe after the pattern check: the first 17 characters are digits.
        val sum = idCard.take(17).mapIndexed { index, char ->
            char.digitToInt() * ID_WEIGHTS[index]
        }.sum()

        val expectedCheckCode = ID_CHECK_CODES[sum % 11]
        val actualCheckCode = idCard.last().uppercaseChar()

        return if (expectedCheckCode == actualCheckCode) {
            ValidationResult.Success
        } else {
            ValidationResult.Error("身份证号校验位不正确")
        }
    }

    /** Accepts [url] when it matches [URL_PATTERN] in full. */
    fun validateUrl(url: String): ValidationResult =
        matchOrError(URL_PATTERN, url, "URL格式不正确")

    /** Accepts [ip] when it matches [IPV4_PATTERN] in full. */
    fun validateIPv4(ip: String): ValidationResult =
        matchOrError(IPV4_PATTERN, ip, "IPv4地址格式不正确")

    /**
     * Validates a `YYYY-MM-DD` [date]: first the shape, then the calendar
     * rules (year within 1900..2100, month 1..12, day within the month's
     * length, 29 February only in leap years).
     */
    fun validateDate(date: String): ValidationResult {
        if (!DATE_PATTERN.matches(date)) {
            return ValidationResult.Error("日期格式不正确,应为YYYY-MM-DD")
        }

        // The pattern guarantees three all-digit parts, so toInt() cannot fail.
        val parts = date.split("-")
        val year = parts[0].toInt()
        val month = parts[1].toInt()
        val day = parts[2].toInt()

        return when {
            year < 1900 || year > 2100 -> ValidationResult.Error("年份应在1900-2100之间")
            month < 1 || month > 12 -> ValidationResult.Error("月份应在1-12之间")
            day < 1 || day > 31 -> ValidationResult.Error("日期应在1-31之间")
            month == 2 && day > 29 -> ValidationResult.Error("2月份日期不能超过29")
            month == 2 && day == 29 && !isLeapYear(year) -> ValidationResult.Error("非闰年2月不能有29日")
            (month == 4 || month == 6 || month == 9 || month == 11) && day > 30 ->
                ValidationResult.Error("该月份只有30天")
            else -> ValidationResult.Success
        }
    }

    /** Gregorian leap-year rule. */
    private fun isLeapYear(year: Int): Boolean {
        return (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0)
    }

    /** Shared whole-string check used by the single-pattern validators. */
    private fun matchOrError(pattern: Regex, input: String, errorMessage: String): ValidationResult =
        if (pattern.matches(input)) ValidationResult.Success else ValidationResult.Error(errorMessage)

    /** Outcome of a validation: either [Success] or an [Error] carrying a message. */
    sealed class ValidationResult {
        object Success : ValidationResult()
        data class Error(val message: String) : ValidationResult()
    }
}

/**
 * Runs each validator of [DataValidator] over a few sample inputs and prints
 * whether every sample was accepted or why it was rejected.
 */
fun main() {
    println("=== 数据验证示例 ===")

    val validator = DataValidator()

    // Sample inputs, grouped by the kind of value they are supposed to be.
    val samplesByCategory = mapOf(
        "邮箱" to listOf("user@example.com", "invalid-email", "test@domain"),
        "手机号" to listOf("13812345678", "12345678901", "1381234567"),
        "密码" to listOf("Password123!", "password", "PASSWORD123", "Pass123"),
        "身份证" to listOf("110101199001011234", "11010119900101123X", "123456789012345678"),
        "URL" to listOf("https://www.example.com", "http://test.org", "invalid-url"),
        "IP地址" to listOf("192.168.1.1", "255.255.255.255", "256.1.1.1", "192.168.1"),
        "日期" to listOf("2023-12-25", "2023-02-29", "2024-02-29", "2023-13-01")
    )

    for ((category, samples) in samplesByCategory) {
        println("=== $category 验证 ===")
        for (sample in samples) {
            // The category label selects the validator to apply.
            val outcome = when (category) {
                "邮箱" -> validator.validateEmail(sample)
                "手机号" -> validator.validatePhone(sample)
                "密码" -> validator.validatePassword(sample)
                "身份证" -> validator.validateIdCard(sample)
                "URL" -> validator.validateUrl(sample)
                "IP地址" -> validator.validateIPv4(sample)
                "日期" -> validator.validateDate(sample)
                else -> DataValidator.ValidationResult.Error("未知类型")
            }

            val verdict = when (outcome) {
                is DataValidator.ValidationResult.Success -> "✓ 有效"
                is DataValidator.ValidationResult.Error -> "✗ ${outcome.message}"
            }

            println("$sample: $verdict")
        }
        println()
    }
}

实际应用示例

日志分析器

kotlin
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

/**
 * One parsed log line.
 *
 * @property timestamp point in time recorded on the line
 * @property level severity label as it appears in the log (e.g. `ERROR`)
 * @property logger name of the logger that emitted the line
 * @property message free-text remainder of the line
 * @property thread optional thread name; `null` unless supplied
 * @property exception optional exception text; `null` unless supplied
 */
data class LogEntry(
    val timestamp: LocalDateTime,
    val level: String,
    val logger: String,
    val message: String,
    val thread: String? = null,
    val exception: String? = null
)

/**
 * Parses plain-text log output into [LogEntry] records and derives simple
 * statistics from them: error counts, IP addresses mentioned in messages,
 * entries that look like security incidents, and a combined text report.
 */
class LogAnalyzer {

    // Regular expressions for the known log layouts, keyed by layout name.
    // Only "standard" and "detailed" are consulted by parseLogLine; "apache"
    // is declared but not used by the parser.
    private val logPatterns = mapOf(
        "standard" to Regex("""(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(\w+)\s+\[([^\]]+)\]\s+(.+)"""),
        "detailed" to Regex("""(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\s+(\w+)\s+(\d+)\s+---\s+\[([^\]]+)\]\s+([^:]+):\s+(.+)"""),
        "apache" to Regex("""(\d+\.\d+\.\d+\.\d+)\s+-\s+-\s+\[([^\]]+)\]\s+"([^"]+)"\s+(\d+)\s+(\d+)""")
    )

    private val dateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    private val detailedDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")

    // Analysis patterns, compiled once per analyzer instead of once per call.
    private val exceptionPattern = Regex("""([a-zA-Z.]+Exception)""")
    private val ipPattern = Regex("""\b(?:\d{1,3}\.){3}\d{1,3}\b""")
    private val suspiciousPatterns = listOf(
        Regex("""(?i)sql\s+injection"""),
        Regex("""(?i)xss\s+attack"""),
        Regex("""(?i)unauthorized\s+access"""),
        Regex("""(?i)brute\s+force"""),
        Regex("""(?i)ddos"""),
        Regex("""(?i)malware""")
    )

    /**
     * Splits [logContent] into lines and parses each non-blank one.
     *
     * @return the entries that could be parsed, in input order; lines in an
     *   unknown layout or with an invalid timestamp are skipped.
     */
    fun parseLogFile(logContent: String): List<LogEntry> =
        logContent.lineSequence()
            .filter { it.isNotBlank() }
            .mapNotNull { parseLogLine(it) }
            .toList()

    /**
     * Parses one line, trying the standard layout first and the detailed
     * layout second. Returns `null` when neither layout matches, or when the
     * timestamp has the right shape but is not a real date/time (e.g. month 13).
     */
    private fun parseLogLine(line: String): LogEntry? =
        try {
            parseStandardLine(line) ?: parseDetailedLine(line)
        } catch (e: java.time.format.DateTimeParseException) {
            // Treat an impossible timestamp like any other unparseable line
            // rather than aborting the whole file.
            null
        }

    /** Standard layout: `timestamp LEVEL [logger] message`. */
    private fun parseStandardLine(line: String): LogEntry? {
        val groups = logPatterns.getValue("standard").find(line)?.groupValues ?: return null
        return LogEntry(
            timestamp = LocalDateTime.parse(groups[1], dateTimeFormatter),
            level = groups[2],
            logger = groups[3],
            message = groups[4]
        )
    }

    /**
     * Detailed layout: `timestamp LEVEL pid --- [thread] logger: message`.
     * Group 3 is the numeric process id, for which [LogEntry] has no field,
     * so thread / logger / message are groups 4 / 5 / 6.
     */
    private fun parseDetailedLine(line: String): LogEntry? {
        val groups = logPatterns.getValue("detailed").find(line)?.groupValues ?: return null
        return LogEntry(
            timestamp = LocalDateTime.parse(groups[1], detailedDateTimeFormatter),
            level = groups[2],
            thread = groups[4],
            logger = groups[5],
            message = groups[6]
        )
    }

    /**
     * Summarises the entries whose level is exactly `ERROR`.
     *
     * @return a map with the keys `totalErrors` (Int), `errorsByLogger`
     *   (Map<String, Int>), `errorsByHour` (Map<Int, Int>) and `topExceptions`
     *   (up to five `Pair<String, Int>` of exception name and count, most
     *   frequent first).
     */
    fun analyzeErrors(entries: List<LogEntry>): Map<String, Any> {
        val errorEntries = entries.filter { it.level == "ERROR" }
        val errorsByLogger = errorEntries.groupBy { it.logger }
        val errorsByHour = errorEntries.groupBy { it.timestamp.hour }

        // First "...Exception" token found in each error message, counted by name.
        val exceptionTypes = errorEntries.mapNotNull { entry ->
            exceptionPattern.find(entry.message)?.groupValues?.get(1)
        }.groupingBy { it }.eachCount()

        return mapOf(
            "totalErrors" to errorEntries.size,
            "errorsByLogger" to errorsByLogger.mapValues { it.value.size },
            "errorsByHour" to errorsByHour.mapValues { it.value.size },
            "topExceptions" to exceptionTypes.toList().sortedByDescending { it.second }.take(5)
        )
    }

    /**
     * Collects every dotted-quad token found in the entries' messages.
     * The pattern only checks the shape (1-3 digits per part), not that each
     * part is within 0..255.
     */
    fun extractIpAddresses(entries: List<LogEntry>): Set<String> {
        return entries.flatMap { entry ->
            ipPattern.findAll(entry.message).map { it.value }
        }.toSet()
    }

    /**
     * Returns `"timestamp: message"` for every entry whose message matches at
     * least one suspicious pattern. Each entry is reported once, even when
     * several patterns match it.
     */
    fun findSuspiciousActivity(entries: List<LogEntry>): List<String> =
        entries
            .filter { entry -> suspiciousPatterns.any { it.containsMatchIn(entry.message) } }
            .map { entry -> "${entry.timestamp}: ${entry.message}" }

    /**
     * Builds a multi-section text report: totals and time range, error
     * analysis, up to ten IP addresses and up to five suspicious entries.
     */
    fun generateReport(entries: List<LogEntry>): String {
        val errorAnalysis = analyzeErrors(entries)
        val ipAddresses = extractIpAddresses(entries)
        val suspiciousActivity = findSuspiciousActivity(entries)

        return buildString {
            appendLine("=== 日志分析报告 ===")
            appendLine("总日志条数: ${entries.size}")
            appendLine("时间范围: ${entries.minByOrNull { it.timestamp }?.timestamp} 到 ${entries.maxByOrNull { it.timestamp }?.timestamp}")
            appendLine()

            appendLine("=== 错误分析 ===")
            appendLine("总错误数: ${errorAnalysis["totalErrors"]}")

            // The casts mirror the value types analyzeErrors puts under these keys.
            @Suppress("UNCHECKED_CAST")
            val errorsByLogger = errorAnalysis["errorsByLogger"] as Map<String, Int>
            appendLine("按日志器分组的错误:")
            errorsByLogger.forEach { (logger, count) ->
                appendLine("  $logger: $count")
            }

            @Suppress("UNCHECKED_CAST")
            val topExceptions = errorAnalysis["topExceptions"] as List<Pair<String, Int>>
            appendLine("主要异常类型:")
            topExceptions.forEach { (exception, count) ->
                appendLine("  $exception: $count")
            }
            appendLine()

            appendLine("=== IP地址统计 ===")
            appendLine("发现的IP地址数量: ${ipAddresses.size}")
            ipAddresses.take(10).forEach { ip ->
                appendLine("  $ip")
            }
            appendLine()

            appendLine("=== 可疑活动 ===")
            if (suspiciousActivity.isNotEmpty()) {
                appendLine("发现 ${suspiciousActivity.size} 条可疑活动:")
                suspiciousActivity.take(5).forEach { activity ->
                    appendLine("  $activity")
                }
            } else {
                appendLine("未发现可疑活动")
            }
        }
    }
}

/**
 * Feeds a small in-memory log sample through [LogAnalyzer] and prints the
 * number of parsed entries followed by the generated report.
 */
fun main() {
    println("=== 日志分析器示例 ===")

    // Simulated log content in the standard layout.
    val sampleLog = """
        2023-12-25 10:30:15 INFO [com.example.UserService] User login successful: user123
        2023-12-25 10:30:16 DEBUG [com.example.DatabaseConnection] Connection established to 192.168.1.100
        2023-12-25 10:30:17 ERROR [com.example.PaymentService] Payment failed: java.lang.NullPointerException at line 45
        2023-12-25 10:30:18 WARN [com.example.SecurityService] Multiple failed login attempts from 192.168.1.200
        2023-12-25 10:30:19 ERROR [com.example.UserService] SQL injection attempt detected from 10.0.0.50
        2023-12-25 10:30:20 INFO [com.example.OrderService] Order created successfully: order456
        2023-12-25 10:30:21 ERROR [com.example.DatabaseConnection] Connection timeout: java.sql.SQLException
        2023-12-25 10:30:22 WARN [com.example.SecurityService] Suspicious XSS attack pattern detected
        2023-12-25 10:30:23 INFO [com.example.UserService] User logout: user123
        2023-12-25 10:30:24 ERROR [com.example.PaymentService] Unauthorized access attempt from 172.16.0.10
    """.trimIndent()

    val logAnalyzer = LogAnalyzer()

    // Parse first ...
    val parsed = logAnalyzer.parseLogFile(sampleLog)
    println("解析了 ${parsed.size} 条日志记录")
    println()

    // ... then print the analysis report.
    println(logAnalyzer.generateReport(parsed))
}

性能优化

正则表达式优化技巧

kotlin
import kotlin.system.measureTimeMillis

/**
 * Small timing experiments around regex usage. Each method measures a few
 * variants with [measureTimeMillis] and prints the elapsed times; the numbers
 * are rough wall-clock figures, not a rigorous benchmark.
 */
class RegexPerformanceDemo {

    /** Compares compiling the pattern on every iteration with compiling it once. */
    fun demonstrateCompilation() {
        println("=== 正则表达式编译优化 ===")

        // String has no `*` operator in Kotlin; repeat() builds the long input.
        val testText = "The quick brown fox jumps over the lazy dog. ".repeat(1000)
        val pattern = """\b\w{5}\b"""  // words of exactly five word characters

        // Discouraged: recompile inside the loop.
        val time1 = measureTimeMillis {
            repeat(1000) {
                pattern.toRegex().findAll(testText).count()
            }
        }

        // Preferred: compile once, reuse.
        val compiledRegex = pattern.toRegex()
        val time2 = measureTimeMillis {
            repeat(1000) {
                compiledRegex.findAll(testText).count()
            }
        }

        println("重复编译耗时: ${time1}ms")
        println("预编译耗时: ${time2}ms")
        println("性能提升: ${speedup(time1, time2)}倍")
        println()
    }

    /**
     * Compares an unanchored pattern with the same pattern anchored at the
     * start. Note the two are only equivalent here because every test string
     * begins with the searched text.
     */
    fun demonstrateAnchoring() {
        println("=== 锚点优化 ===")

        val testStrings = (1..10000).map { "test string $it with some content" }

        // Without an anchor the engine may retry at later positions.
        val unanchoredRegex = Regex("""test string \d+""")

        // With '^' only the start of the input is considered.
        val anchoredRegex = Regex("""^test string \d+""")

        val time1 = measureTimeMillis {
            testStrings.forEach { unanchoredRegex.containsMatchIn(it) }
        }

        val time2 = measureTimeMillis {
            testStrings.forEach { anchoredRegex.containsMatchIn(it) }
        }

        println("无锚点耗时: ${time1}ms")
        println("有锚点耗时: ${time2}ms")
        println("性能提升: ${speedup(time1, time2)}倍")
        println()
    }

    /**
     * Times three ways of matching the fruit names in a sentence: a plain
     * alternation, a reordered alternation, and a character-class pattern.
     */
    fun demonstrateAlternatives() {
        println("=== 选择优化 ===")

        val testText = "apple banana cherry date elderberry fig grape"

        // Baseline alternation.
        val complexRegex = Regex("""apple|banana|cherry|date|elderberry|fig|grape""")

        // Same alternatives in a different order.
        val optimizedRegex = Regex("""apple|banana|cherry|grape|date|fig|elderberry""")

        // Character class. NOTE: this is broader than the alternations — it
        // accepts any word starting with a..g, not just the seven fruit names.
        val characterClassRegex = Regex("""\b[abcdefg]\w+""")

        val iterations = 100000

        val time1 = measureTimeMillis {
            repeat(iterations) { complexRegex.findAll(testText).count() }
        }

        val time2 = measureTimeMillis {
            repeat(iterations) { optimizedRegex.findAll(testText).count() }
        }

        val time3 = measureTimeMillis {
            repeat(iterations) { characterClassRegex.findAll(testText).count() }
        }

        println("复杂选择耗时: ${time1}ms")
        println("优化选择耗时: ${time2}ms")
        println("字符类耗时: ${time3}ms")
        println()
    }

    /**
     * Ratio of [baselineMs] to [improvedMs], formatted with two decimals.
     * The divisor is clamped to 1 ms so that a run too fast to register on
     * the millisecond clock does not produce Infinity or NaN.
     */
    private fun speedup(baselineMs: Long, improvedMs: Long): String =
        (baselineMs.toDouble() / improvedMs.coerceAtLeast(1L)).format(2)

    /** Formats this number with a fixed count of decimal [digits]. */
    private fun Double.format(digits: Int) = "%.${digits}f".format(this)
}

/**
 * Runs the three [RegexPerformanceDemo] experiments in order and then prints
 * a summary of best practices.
 */
fun main() {
    with(RegexPerformanceDemo()) {
        demonstrateCompilation()
        demonstrateAnchoring()
        demonstrateAlternatives()
    }

    println("=== 最佳实践总结 ===")
    // One tip per output line, in the order listed.
    val tips = listOf(
        "1. 预编译经常使用的正则表达式",
        "2. 使用锚点(^, $)限制搜索范围",
        "3. 优化选择的顺序,常见的放在前面",
        "4. 使用字符类代替复杂的选择",
        "5. 避免过度使用回溯",
        "6. 使用非捕获组(?:)当不需要捕获时"
    )
    for (tip in tips) {
        println(tip)
    }
}

最佳实践

1. 正则表达式设计原则

kotlin
/**
 * Illustrates readable regex design: named constants, a commented pattern,
 * a complex pattern assembled from small named parts, and validation methods
 * wrapped around the patterns.
 */
class RegexBestPractices {

    companion object {
        // Named constant; COMMENTS mode allows the layout and '#' notes inside the pattern.
        val EMAIL_REGEX = """
            ^[a-zA-Z0-9._%+-]+    # 用户名部分
            @                     # @ 符号
            [a-zA-Z0-9.-]+        # 域名部分
            \.                    # 点号
            [a-zA-Z]{2,}$         # 顶级域名
        """.trimIndent().toRegex(RegexOption.COMMENTS)

        // A longer pattern composed from its parts.
        private val PHONE_AREA_CODE = """\d{3}"""
        private val PHONE_EXCHANGE = """\d{3}"""
        private val PHONE_NUMBER = """\d{4}"""
        val PHONE_REGEX = "^$PHONE_AREA_CODE-$PHONE_EXCHANGE-$PHONE_NUMBER$".toRegex()
    }

    /** True when [email] matches [EMAIL_REGEX] in full. */
    fun isValidEmail(email: String): Boolean {
        return email matches EMAIL_REGEX
    }

    /** True when [phone] matches [PHONE_REGEX] in full (`ddd-ddd-dddd`). */
    fun isValidPhone(phone: String): Boolean {
        return phone matches PHONE_REGEX
    }
}

2. 错误处理

kotlin
/**
 * Compiles [pattern] and collects the text of every match in [text].
 *
 * @return a successful [Result] holding the matched substrings in order, or
 *   a failed [Result] wrapping the exception raised while compiling or
 *   matching (for example an invalid pattern).
 */
fun safeRegexOperation(pattern: String, text: String): Result<List<String>> =
    try {
        val hits = mutableListOf<String>()
        for (hit in Regex(pattern).findAll(text)) {
            hits.add(hit.value)
        }
        Result.success(hits)
    } catch (failure: Exception) {
        Result.failure(failure)
    }

3. 测试正则表达式

kotlin
/** Minimal hand-rolled test harness for a regular expression. */
class RegexTester {

    /**
     * Checks an e-mail pattern against addresses that must match and
     * addresses that must not, printing a tick or cross per address and
     * asserting the expectation (assertions take effect only when enabled
     * on the JVM).
     */
    fun testEmailRegex() {
        val pattern = Regex("""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$""")

        val mustMatch = listOf(
            "user@example.com",
            "test.email@domain.org",
            "user+tag@example.co.uk"
        )

        val mustNotMatch = listOf(
            "invalid-email",
            "@domain.com",
            "user@",
            "user@domain"
        )

        println("=== 邮箱正则表达式测试 ===")

        for (address in mustMatch) {
            val matched = pattern.matches(address)
            val mark = if (matched) "✓" else "✗"
            println("$address: $mark")
            assert(matched) { "应该匹配: $address" }
        }

        for (address in mustNotMatch) {
            val matched = pattern.matches(address)
            // For negative cases the tick means "correctly rejected".
            val mark = if (matched) "✗" else "✓"
            println("$address: $mark")
            assert(!matched) { "不应该匹配: $address" }
        }
    }
}

下一步

掌握了正则表达式后,让我们学习Kotlin的标准库,了解丰富的内置函数和工具类。

下一章: 标准库

练习题

  1. 编写一个文本处理工具,支持多种格式的数据提取和转换
  2. 创建一个表单验证系统,验证各种用户输入格式
  3. 实现一个简单的模板引擎,支持变量替换和条件渲染
  4. 设计一个代码格式化工具,使用正则表达式美化代码
  5. 开发一个日志监控系统,实时分析日志文件并报警异常模式

本站内容仅供学习和研究使用。