Skip to content

Go 语言正则表达式

正则表达式是处理文本模式匹配的强大工具。Go 语言通过 regexp 包提供了基于 RE2 语法的正则表达式支持:匹配时间与输入长度成线性关系,但不支持前瞻/后顾断言和反向引用。

📋 正则表达式基础

基本匹配和查找

go
package main

import (
    "fmt"
    "regexp"
)

// basicMatching prints a short tour of the regexp package: a one-shot
// MatchString call, matching several inputs against a compiled pattern, and
// finding the first and all occurrences of a pattern in a string.
func basicMatching() {
    fmt.Println("=== 基本正则匹配 ===")

    // One-shot match: regexp.MatchString compiles the pattern on each call
    // and reports a compile failure through its error result.
    pattern := "Go"
    text := "Go is a programming language"

    matched, err := regexp.MatchString(pattern, text)
    if err != nil {
        fmt.Printf("正则表达式 '%s' 无效: %v\n", pattern, err)
        return
    }
    fmt.Printf("'%s' 匹配 '%s': %v\n", pattern, text, matched)

    // Compiled pattern: the standalone word "Go" or "go".
    re := regexp.MustCompile(`\b[Gg]o\b`)

    testTexts := []string{
        "Go is great",
        "I love go programming",
        "going somewhere",
        "Let's go!",
    }

    // The loop variable has its own name so it does not shadow text above.
    for _, candidate := range testTexts {
        mark := "❌"
        if re.MatchString(candidate) {
            mark = "✅"
        }
        fmt.Printf("%s '%s'\n", mark, candidate)
    }

    // First match versus all matches (-1 means no limit on the match count).
    numberRe := regexp.MustCompile(`\d+`)
    numText := "我有 3 个苹果和 15 个橙子"

    fmt.Printf("第一个数字: %s\n", numberRe.FindString(numText))
    fmt.Printf("所有数字: %v\n", numberRe.FindAllString(numText, -1))
}

// main is the program entry point; it runs the basic matching demo.
func main() {
    basicMatching()
}

捕获组和替换

go
package main

import (
    "fmt"
    "regexp"
)

// captureAndReplace prints examples of positional capture groups, named
// capture groups, plain replacement, and replacement that reorders captured
// groups through $N references in the template.
func captureAndReplace() {
    fmt.Println("=== 捕获组和替换 ===")

    // Positional groups: index 0 is the whole match, 1-3 are year/month/day.
    isoDate := regexp.MustCompile(`(\d{4})-(\d{2})-(\d{2})`)
    if parts := isoDate.FindStringSubmatch("今天是 2023-12-25"); parts != nil {
        fmt.Printf("完整匹配: %s\n", parts[0])
        fmt.Printf("年: %s, 月: %s, 日: %s\n", parts[1], parts[2], parts[3])
    }

    // Named groups: SubexpNames is indexed like the submatch slice, with an
    // empty name at index 0 (the whole match) and for unnamed groups.
    namedDate := regexp.MustCompile(`(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})`)
    if groups := namedDate.FindStringSubmatch("生日: 1990-05-15"); groups != nil {
        for idx, label := range namedDate.SubexpNames() {
            if idx == 0 || label == "" {
                continue
            }
            fmt.Printf("%s: %s\n", label, groups[idx])
        }
    }

    fmt.Println("\n字符串替换:")

    // Whole-word replacement of "cat".
    const sentence = "The cat sat on the mat. Another cat was nearby."
    wholeWordCat := regexp.MustCompile(`\bcat\b`)
    fmt.Printf("原文: %s\n", sentence)
    fmt.Printf("替换后: %s\n", wholeWordCat.ReplaceAllString(sentence, "dog"))

    // Reorder captured groups: YYYY-MM-DD becomes MM/DD/YYYY.
    const dates = "日期: 2023-12-25 和 2023-01-01"
    fmt.Printf("原格式: %s\n", dates)
    fmt.Printf("新格式: %s\n", isoDate.ReplaceAllString(dates, "$2/$3/$1"))
}

// main is the program entry point; it runs the capture/replace demo.
func main() {
    captureAndReplace()
}

🎯 数据验证

常用验证模式

go
package main

import (
    "fmt"
    "regexp"
)

// Validator checks strings against named validation rules. Each rule is a
// list of compiled regular expressions, and a value passes the rule only when
// every expression in the list matches it.
type Validator struct {
    patterns map[string][]*regexp.Regexp
}

// NewValidator returns a Validator preloaded with the "email", "phone",
// "password", "url" and "ipv4" rules.
//
// Go's regexp package implements RE2 syntax, which has no lookahead, so a
// single password pattern built from (?=...) groups cannot be compiled and
// would make MustCompile panic. The password rule is therefore expressed as
// one charset/length pattern plus one pattern per required character class.
func NewValidator() *Validator {
    // single wraps one expression as a one-element rule.
    single := func(expr string) []*regexp.Regexp {
        return []*regexp.Regexp{regexp.MustCompile(expr)}
    }

    return &Validator{
        patterns: map[string][]*regexp.Regexp{
            "email": single(`^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$`),
            "phone": single(`^1[3-9]\d{9}$`), // mainland-China mobile number
            "password": {
                regexp.MustCompile(`^[A-Za-z\d@$!%*?&]{8,}$`), // allowed characters, at least 8
                regexp.MustCompile(`[a-z]`),                   // at least one lowercase letter
                regexp.MustCompile(`[A-Z]`),                   // at least one uppercase letter
                regexp.MustCompile(`\d`),                      // at least one digit
                regexp.MustCompile(`[@$!%*?&]`),               // at least one special character
            },
            "url":  single(`^https?:\/\/[^\s/$.?#].[^\s]*$`),
            "ipv4": single(`^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$`),
        },
    }
}

// Validate reports whether value satisfies the rule registered under
// dataType. It returns false when no rule with that name exists.
func (v *Validator) Validate(dataType, value string) bool {
    rule, exists := v.patterns[dataType]
    if !exists || len(rule) == 0 {
        return false
    }
    for _, re := range rule {
        if !re.MatchString(value) {
            return false
        }
    }
    return true
}

// validationDemo runs a fixed set of sample values through a Validator and
// prints a pass/fail mark for each one, grouped by data type.
func validationDemo() {
    fmt.Println("=== 数据验证演示 ===")

    validator := NewValidator()

    // A slice rather than a map: Go randomizes map iteration order, which
    // would print the sections in a different order on every run.
    testData := []struct {
        dataType string
        values   []string
    }{
        {"email", []string{
            "user@example.com",  // ✅
            "invalid-email",     // ❌
            "test@domain.co.uk", // ✅
        }},
        {"phone", []string{
            "13812345678", // ✅
            "1234567890",  // ❌
            "15987654321", // ✅
        }},
        {"password", []string{
            "Password123!",     // ✅
            "password",         // ❌
            "ComplexPass@2023", // ✅
        }},
        {"url", []string{
            "https://www.example.com", // ✅
            "invalid-url",             // ❌
            "http://api.service.org",  // ✅
        }},
    }

    for _, group := range testData {
        fmt.Printf("\n%s 验证:\n", group.dataType)
        for _, value := range group.values {
            status := "❌"
            if validator.Validate(group.dataType, value) {
                status = "✅"
            }
            fmt.Printf("  %s '%s'\n", status, value)
        }
    }
}

// main is the program entry point; it runs the validation demo.
func main() {
    validationDemo()
}

🔧 文本处理

文本清理和处理

go
package main

import (
    "fmt"
    "regexp"
    "strings"
)

// TextProcessor bundles the precompiled regular expressions used by the
// cleaning, extraction and masking methods below. Build it with
// NewTextProcessor so that every pattern is compiled exactly once.
type TextProcessor struct {
    htmlTags   *regexp.Regexp // '<' up to the next '>'
    whitespace *regexp.Regexp // runs of whitespace
    numbers    *regexp.Regexp // digit runs with an optional decimal part
    emails     *regexp.Regexp // e-mail-like tokens (unanchored)
    phones     *regexp.Regexp // 11-digit mobile numbers starting 13-19 (unanchored)
}

// NewTextProcessor returns a TextProcessor with all of its patterns compiled.
func NewTextProcessor() *TextProcessor {
    return &TextProcessor{
        htmlTags:   regexp.MustCompile(`<[^>]*>`),
        whitespace: regexp.MustCompile(`\s+`),
        // The optional fractional part keeps values such as 299.99 in one piece
        // instead of splitting them into "299" and "99".
        numbers: regexp.MustCompile(`\d+(?:\.\d+)?`),
        emails:  regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`),
        phones:  regexp.MustCompile(`1[3-9]\d{9}`),
    }
}

// RemoveHTMLTags deletes every "<...>" span from text. It is a purely textual
// filter, not an HTML parser: a literal '<' followed later by '>' is removed
// along with everything between them.
func (tp *TextProcessor) RemoveHTMLTags(text string) string {
    return tp.htmlTags.ReplaceAllString(text, "")
}

// NormalizeWhitespace collapses each run of whitespace into a single space
// and trims leading and trailing whitespace.
func (tp *TextProcessor) NormalizeWhitespace(text string) string {
    return strings.TrimSpace(tp.whitespace.ReplaceAllString(text, " "))
}

// ExtractNumbers returns every number found in text, in order of appearance.
// A decimal such as "299.99" is returned as one element. The result is nil
// when text contains no digits.
func (tp *TextProcessor) ExtractNumbers(text string) []string {
    return tp.numbers.FindAllString(text, -1)
}

// ExtractEmails returns every e-mail-like token found in text, in order of
// appearance, or nil when there is none.
func (tp *TextProcessor) ExtractEmails(text string) []string {
    return tp.emails.FindAllString(text, -1)
}

// MaskSensitiveData hides e-mail addresses and mobile numbers in text.
//
// An address keeps the first two characters of its user part and its domain
// ("alice@example.com" becomes "al***@example.com"); when the user part has
// two characters or fewer the whole address becomes "***@***". A phone number
// keeps its first three and last four digits ("139****4321").
func (tp *TextProcessor) MaskSensitiveData(text string) string {
    emailMasked := tp.emails.ReplaceAllStringFunc(text, func(email string) string {
        parts := strings.Split(email, "@")
        if len(parts) == 2 {
            user, domain := parts[0], parts[1]
            if len(user) > 2 {
                return user[:2] + "***@" + domain
            }
        }
        return "***@***"
    })

    // Every match of the phone pattern is exactly 11 ASCII digits, so the byte
    // offsets 3 and 7 are always in range.
    return tp.phones.ReplaceAllStringFunc(emailMasked, func(phone string) string {
        return phone[:3] + "****" + phone[7:]
    })
}

// textProcessingDemo prints the result of each TextProcessor operation on
// sample inputs: tag stripping, whitespace normalization, e-mail and number
// extraction, and masking of sensitive data.
func textProcessingDemo() {
    fmt.Println("=== 文本处理演示 ===")

    tp := NewTextProcessor()

    // HTML cleanup: strip the tags first, then collapse the leftover
    // whitespace (the raw string below contains a newline and indentation).
    const rawHTML = `<p>联系我们:<strong>邮箱</strong> admin@company.com</p>
                 <div>电话:<span>13812345678</span></div>`
    fmt.Printf("原文: %s\n", rawHTML)

    stripped := tp.RemoveHTMLTags(rawHTML)
    fmt.Printf("移除HTML: %s\n", stripped)
    fmt.Printf("规范空白: %s\n", tp.NormalizeWhitespace(stripped))

    // Data extraction.
    fmt.Println("\n数据提取:")
    const contact = "联系方式: john@example.com, admin@company.org, 电话: 13812345678, 价格: 299.99"
    foundEmails := tp.ExtractEmails(contact)
    foundNumbers := tp.ExtractNumbers(contact)
    fmt.Printf("原文: %s\n", contact)
    fmt.Printf("邮箱: %v\n", foundEmails)
    fmt.Printf("数字: %v\n", foundNumbers)

    // Masking of sensitive data.
    fmt.Println("\n敏感数据脱敏:")
    const private = "用户邮箱: alice@example.com, 手机: 13987654321"
    fmt.Printf("原文: %s\n", private)
    fmt.Printf("脱敏后: %s\n", tp.MaskSensitiveData(private))
}

// main is the program entry point; it runs the text processing demo.
func main() {
    textProcessingDemo()
}

🎯 实际应用示例

日志解析器

go
package main

import (
    "fmt"
    "regexp"
    "strconv"
    "strings"
    "time"
)

// Layouts, in Go reference-time form, of the timestamps in the two log formats.
const (
    accessLogTimeLayout = "02/Jan/2006:15:04:05"
    errorLogTimeLayout  = "2006-01-02 15:04:05"
)

// LogEntry is one parsed log line, from either the access log or the error log.
type LogEntry struct {
    Timestamp time.Time // time recorded in the line
    Level     string    // "ACCESS" for access-log lines, otherwise the level written in the line
    IP        string    // client IP; empty when an error-log message contains none
    Message   string    // quoted request (access log) or message text (error log)
    Status    int       // status code from the access log; always 500 for error-log lines
}

// LogParser parses access-log and error-log lines with precompiled patterns.
type LogParser struct {
    accessLogRe *regexp.Regexp
    errorLogRe  *regexp.Regexp
    ipRe        *regexp.Regexp
}

// NewLogParser returns a LogParser with its patterns compiled.
func NewLogParser() *LogParser {
    return &LogParser{
        // Simplified access-log format: IP [time] "request" status
        accessLogRe: regexp.MustCompile(`^(\S+) \[([^\]]+)\] "([^"]*)" (\d+)`),

        // Error-log format: [time] LEVEL: message
        errorLogRe: regexp.MustCompile(`^\[([^\]]+)\] (\w+): (.+)`),

        // Dotted-quad IP address (octet ranges are not checked)
        ipRe: regexp.MustCompile(`\b(?:\d{1,3}\.){3}\d{1,3}\b`),
    }
}

// ParseAccessLog parses one line of the simplified access-log format
// `IP [02/Jan/2006:15:04:05] "request" status`.
//
// It returns an error when the line does not have that shape, when the
// timestamp does not follow the layout, or when the status code does not fit
// in an int; the returned entry is nil in all of those cases.
func (lp *LogParser) ParseAccessLog(line string) (*LogEntry, error) {
    matches := lp.accessLogRe.FindStringSubmatch(line)
    if len(matches) < 5 {
        return nil, fmt.Errorf("无法解析访问日志")
    }

    timestamp, err := time.Parse(accessLogTimeLayout, matches[2])
    if err != nil {
        return nil, fmt.Errorf("无法解析访问日志时间 %q: %w", matches[2], err)
    }

    // The pattern guarantees digits only, so Atoi can fail only on overflow.
    status, err := strconv.Atoi(matches[4])
    if err != nil {
        return nil, fmt.Errorf("无法解析访问日志状态码 %q: %w", matches[4], err)
    }

    return &LogEntry{
        Timestamp: timestamp,
        Level:     "ACCESS",
        IP:        matches[1],
        Message:   matches[3],
        Status:    status,
    }, nil
}

// ParseErrorLog parses one line of the error-log format
// `[2006-01-02 15:04:05] LEVEL: message`.
//
// The first IP-like token in the message, if any, becomes the entry's IP.
// Every entry is given Status 500 regardless of its level. It returns an
// error (and a nil entry) when the line does not have the expected shape or
// the timestamp does not follow the layout.
func (lp *LogParser) ParseErrorLog(line string) (*LogEntry, error) {
    matches := lp.errorLogRe.FindStringSubmatch(line)
    if len(matches) < 4 {
        return nil, fmt.Errorf("无法解析错误日志")
    }

    timestamp, err := time.Parse(errorLogTimeLayout, matches[1])
    if err != nil {
        return nil, fmt.Errorf("无法解析错误日志时间 %q: %w", matches[1], err)
    }

    return &LogEntry{
        Timestamp: timestamp,
        Level:     matches[2],
        IP:        lp.ipRe.FindString(matches[3]), // "" when the message has no IP
        Message:   matches[3],
        Status:    500,
    }, nil
}

// AnalyzeLogs aggregates entries into a statistics map with these keys:
//
//   - "total":           int, number of entries
//   - "levels":          map[string]int, count per Level
//   - "status_codes":    map[string]int, count per Status (key is the decimal string)
//   - "unique_ips":      map[string]bool, set of non-empty IPs seen
//   - "errors":          int, entries with Status >= 400 or a Level equal to
//     "ERROR" ignoring case
//   - "unique_ip_count": int, size of the "unique_ips" set
func (lp *LogParser) AnalyzeLogs(entries []LogEntry) map[string]interface{} {
    perLevel := make(map[string]int)
    perStatus := make(map[string]int)
    seenIPs := make(map[string]bool)
    errorCount := 0

    for i := range entries {
        e := &entries[i]

        perLevel[e.Level]++
        perStatus[strconv.Itoa(e.Status)]++

        if e.IP != "" {
            seenIPs[e.IP] = true
        }

        isErrorLevel := strings.ToUpper(e.Level) == "ERROR"
        if isErrorLevel || e.Status >= 400 {
            errorCount++
        }
    }

    return map[string]interface{}{
        "total":           len(entries),
        "levels":          perLevel,
        "status_codes":    perStatus,
        "unique_ips":      seenIPs,
        "errors":          errorCount,
        "unique_ip_count": len(seenIPs),
    }
}

// logParsingDemo parses a handful of sample access-log and error-log lines,
// prints each entry that parses successfully, and then prints the aggregate
// statistics computed by AnalyzeLogs. Lines that fail to parse are skipped
// silently.
func logParsingDemo() {
    fmt.Println("=== 日志解析演示 ===")

    lp := NewLogParser()

    // Entries from both logs are collected here for the final analysis.
    var parsed []LogEntry

    // Access log.
    fmt.Println("解析访问日志:")
    for _, raw := range []string{
        `192.168.1.100 [25/Dec/2023:10:00:00] "GET /index.html" 200`,
        `192.168.1.101 [25/Dec/2023:10:01:00] "POST /api/login" 200`,
        `192.168.1.102 [25/Dec/2023:10:02:00] "GET /admin" 403`,
        `192.168.1.100 [25/Dec/2023:10:03:00] "GET /notfound" 404`,
    } {
        rec, err := lp.ParseAccessLog(raw)
        if err != nil {
            continue
        }
        parsed = append(parsed, *rec)
        fmt.Printf("✅ %s [%d] %s from %s\n", rec.Level, rec.Status, rec.Message, rec.IP)
    }

    // Error log.
    fmt.Println("\n解析错误日志:")
    for _, raw := range []string{
        `[2023-12-25 10:05:00] ERROR: Database connection failed from 192.168.1.103`,
        `[2023-12-25 10:06:00] WARN: High memory usage detected`,
        `[2023-12-25 10:07:00] ERROR: Auth failed for 192.168.1.104`,
    } {
        rec, err := lp.ParseErrorLog(raw)
        if err != nil {
            continue
        }
        parsed = append(parsed, *rec)
        fmt.Printf("✅ %s: %s\n", rec.Level, rec.Message)
    }

    // Aggregate statistics.
    fmt.Println("\n=== 统计分析 ===")
    report := lp.AnalyzeLogs(parsed)

    fmt.Printf("总日志数: %v\n", report["total"])
    fmt.Printf("错误数: %v\n", report["errors"])
    fmt.Printf("唯一IP数: %v\n", report["unique_ip_count"])

    // Both breakdowns range over maps, so their line order varies between runs.
    fmt.Println("\n级别分布:")
    for name, n := range report["levels"].(map[string]int) {
        fmt.Printf("  %s: %d\n", name, n)
    }

    fmt.Println("\n状态码分布:")
    for code, n := range report["status_codes"].(map[string]int) {
        fmt.Printf("  %s: %d\n", code, n)
    }
}

// main is the program entry point; it runs the log parsing demo.
func main() {
    logParsingDemo()
}

🎓 小结

本章我们学习了 Go 语言的正则表达式:

  • 基础匹配:模式编译、字符串匹配、查找操作
  • 捕获组:子匹配提取、命名组、模式替换
  • 数据验证:邮箱、手机号、密码等常用验证
  • 文本处理:HTML清理、数据提取、敏感信息脱敏
  • 实际应用:日志解析器实现

正则表达式是处理文本数据的强大工具,在数据验证、文本解析、日志分析等场景中广泛应用。


接下来,我们将学习 Go 类型断言,深入了解接口类型的动态检查机制。

正则表达式建议

  • 优先使用编译后的正则表达式提高性能
  • 合理使用捕获组,避免不必要的复杂度
  • 注意正则表达式的性能:Go 的 regexp 基于 RE2,不会出现灾难性回溯,但应避免在循环或热点路径中重复编译同一个模式
  • 对于复杂验证,考虑组合多个简单模式

本站内容仅供学习和研究使用。