Skip to content

Julia 正则表达式

正则表达式是强大的文本模式匹配工具，Julia 提供了完整的正则表达式支持。

创建正则表达式

基本语法

julia
# Create a regular expression with the r"" literal syntax
pattern = r"hello"
println(typeof(pattern))  # Regex

# Test whether the pattern occurs in a string
text = "hello world"
println(occursin(pattern, text))  # true

# With a flag appended after the closing quote
pattern_i = r"hello"i  # case-insensitive
println(occursin(pattern_i, "HELLO"))  # true

正则表达式标志

julia
# i - case-insensitive matching
r"hello"i

# m - multiline mode (^ and $ match at every line)
r"^hello"m

# s - single-line mode (. also matches newline characters)
r"a.b"s

# x - extended mode (whitespace and comments are allowed inside the pattern;
#     the `#` text inside the literal below is part of the pattern source)
r"""
  \d+    # 匹配数字
  \s*    # 可选空格
  [a-z]+ # 匹配字母
"""x

# Several flags can be combined
r"hello"ims

基本匹配

occursin 检查匹配

julia
text = "Hello, Julia!"

# Plain containment check
println(occursin(r"Julia", text))   # true
println(occursin(r"Python", text))  # false

# Case-insensitive check
println(occursin(r"hello"i, text))  # true

match 查找匹配

julia
text = "My phone is 123-456-7890"

# Find the first match
m = match(r"\d{3}-\d{3}-\d{4}", text)
println(m)          # RegexMatch("123-456-7890")
println(m.match)    # "123-456-7890"
println(m.offset)   # 13 (index at which the match starts)

# match returns nothing when the pattern is not found
m = match(r"email", text)
println(m === nothing)  # true

eachmatch 迭代所有匹配

julia
text = "apple: 5, banana: 3, cherry: 8"

# Iterate over every run of digits
for m in eachmatch(r"\d+", text)
    println(m.match)
end
# Output: 5, 3, 8

# Collect the matches into an array
matches = collect(eachmatch(r"\d+", text))
numbers = [m.match for m in matches]
println(numbers)  # ["5", "3", "8"]

正则表达式语法

字符类

julia
text = "a1B2c3"

# Digits
println(collect(eachmatch(r"\d", text)))  # matches 1, 2, 3

# Letters
println(collect(eachmatch(r"[a-zA-Z]", text)))  # matches a, B, c

# Common character classes
# NOTE: Julia regexes are Unicode-aware by default; append the `a` flag
# (e.g. r"\d"a) to restrict these classes to ASCII.
# \d  - digit ([0-9] in ASCII mode)
# \D  - non-digit
# \w  - word character ([a-zA-Z0-9_] in ASCII mode)
# \W  - non-word character
# \s  - whitespace character
# \S  - non-whitespace character
# .   - any character except newline

量词

julia
# ?   - zero or one
# *   - zero or more
# +   - one or more
# {n} - exactly n
# {n,} - at least n
# {n,m} - between n and m

text = "aaa ab abbb"

println(match(r"ab*", text).match)   # "a" (b occurs zero times at the first 'a')
println(match(r"ab+", text).match)   # "ab"
println(match(r"ab{2}", text).match) # "abb"

锚点

julia
# ^  - start of the string
# $  - end of the string (also matches just before a final trailing newline)
# \b - word boundary

text = "Hello World"

println(occursin(r"^Hello", text))   # true
println(occursin(r"World$", text))   # true
println(occursin(r"\bWorld\b", text)) # true

# Multiline mode: ^ matches at the start of every line
multiline = "line1\nline2"
for m in eachmatch(r"^line\d"m, multiline)
    println(m.match)
end
# Output: line1, line2

分组

julia
text = "John Smith, Jane Doe"

# Capture groups
pattern = r"(\w+) (\w+)"
m = match(pattern, text)

println(m.match)      # "John Smith"
println(m.captures)   # ["John", "Smith"]
println(m[1])         # "John"
println(m[2])         # "Smith"

# Named capture groups, accessed by symbol
pattern = r"(?<first>\w+) (?<last>\w+)"
m = match(pattern, text)
println(m[:first])    # "John"
println(m[:last])     # "Smith"

非捕获组

julia
# (?:...) groups without capturing
text = "Mr. John Smith"
pattern = r"(?:Mr\.|Mrs\.|Ms\.) (\w+) (\w+)"
m = match(pattern, text)
println(m.captures)  # ["John", "Smith"]; the title is not captured

选择

julia
# | - alternation (either side may match)
text = "I have a cat and a dog"

pattern = r"cat|dog"
for m in eachmatch(pattern, text)
    println(m.match)
end
# Output: cat, dog

字符串替换

replace 函数

julia
text = "Hello, World!"

# Simple replacement
result = replace(text, r"World" => "Julia")
println(result)  # "Hello, Julia!"

# Case-insensitive replacement
result = replace("HELLO", r"hello"i => "hi")
println(result)  # "hi"

# Refer to capture groups from a substitution string (s"" literal)
text = "2023-12-25"
result = replace(text, r"(\d{4})-(\d{2})-(\d{2})" => s"\2/\3/\1")
println(result)  # "12/25/2023"

# Use a function as the replacement; it is applied to each matched substring
text = "hello world"
result = replace(text, r"\w+" => uppercase)
println(result)  # "HELLO WORLD"

复杂替换

julia
# Post-process each match with a function.
# A literal dollar sign must be written as \$ inside a Julia string literal;
# a bare $ starts string interpolation and "$100" does not parse.
text = "Price: \$100 and \$200"

"""
    double_price(m)

Return twice the integer value of a matched price, formatted as a `String`.

`m` may be the matched text itself (any `AbstractString`) or an object with a
`match` field such as a `RegexMatch`.
"""
function double_price(m)
    return double_price(m.match)
end

double_price(m::AbstractString) = string(parse(Int, m) * 2)

# replace calls the function with the matched text (a SubString), not a RegexMatch.
result = replace(text, r"\d+" => double_price)
println(result)  # "Price: $200 and $400"

分割字符串

julia
text = "apple, banana; cherry: date"

# Split on several delimiters, swallowing any whitespace that follows them
parts = split(text, r"[,;:]\s*")
println(parts)  # ["apple", "banana", "cherry", "date"]

# keepempty=true keeps empty fields in the result; it does NOT keep the
# delimiters. Without \s* the space after each delimiter stays on the next piece.
parts = split(text, r"[,;:]", keepempty=true)
println(parts)  # ["apple", " banana", " cherry", " date"]

# Split on runs of whitespace
text = "hello   world\tjulia"
parts = split(text, r"\s+")
println(parts)  # ["hello", "world", "julia"]

实用模式

验证模式

julia
"""
    is_valid_email(email)

Return `true` if `email` has the simple shape `local@domain.tld`, `false` otherwise.

This is a lightweight syntactic check, not a complete e-mail address validator.
"""
function is_valid_email(email)
    # End the pattern with \z instead of $: $ also matches just before a trailing
    # newline, so "test@example.com\n" would otherwise be accepted as valid.
    pattern = r"^[\w\.-]+@[\w\.-]+\.\w+\z"
    return occursin(pattern, email)
end

println(is_valid_email("test@example.com"))   # true
println(is_valid_email("invalid-email"))      # false

"""
    is_valid_phone(phone)

Return `true` if `phone` consists of exactly three digits, a hyphen, three digits,
a hyphen and four digits (for example `123-456-7890`), `false` otherwise.
"""
function is_valid_phone(phone)
    # End the pattern with \z instead of $: $ also matches just before a trailing
    # newline, so "123-456-7890\n" would otherwise be accepted as valid.
    pattern = r"^\d{3}-\d{3}-\d{4}\z"
    return occursin(pattern, phone)
end

println(is_valid_phone("123-456-7890"))  # true
println(is_valid_phone("12345"))         # false

"""
    is_valid_url(url)

Return `true` if `url` is an `http` or `https` URL made of a host followed by zero or
more path segments, `false` otherwise.

Only word characters, dots and hyphens are accepted in the host and path; query
strings, fragments and ports are rejected by this simple check.
"""
function is_valid_url(url)
    # End the pattern with \z instead of $: $ also matches just before a trailing
    # newline, so a URL followed by "\n" would otherwise be accepted as valid.
    pattern = r"^https?://[\w\.-]+(?:/[\w\.-]*)*\z"
    return occursin(pattern, url)
end

println(is_valid_url("https://example.com/path"))  # true

提取模式

julia
"""
    extract_numbers(text)

Return a vector holding every run of digits found in `text`, parsed as `Int`, in
order of appearance.
"""
function extract_numbers(text)
    digit_runs = eachmatch(r"\d+", text)
    return map(hit -> parse(Int, hit.match), digit_runs)
end

println(extract_numbers("I have 3 apples and 5 oranges"))
# [3, 5]

"""
    extract_words(text)

Return a vector with every maximal run of word characters in `text`, in order of
appearance.
"""
function extract_words(text)
    word_hits = eachmatch(r"\b\w+\b", text)
    return map(word_hits) do hit
        hit.match
    end
end

println(extract_words("Hello, World!"))
# ["Hello", "World"]

"""
    extract_pairs(text)

Return a `Dict{String, String}` mapping each `key` to its `value` for every
`key=value` occurrence in `text`. When a key appears more than once, the last
occurrence wins.
"""
function extract_pairs(text)
    assignments = eachmatch(r"(\w+)=(\w+)", text)
    # Capture 1 is the key, capture 2 the value; later duplicates overwrite earlier ones.
    return Dict{String, String}(hit[1] => hit[2] for hit in assignments)
end

println(extract_pairs("name=Alice age=30"))
# Dict("name" => "Alice", "age" => "30")

清理文本

julia
"""
    strip_html(html)

Return `html` with every `<...>` tag removed; the text between tags is kept.
"""
function strip_html(html)
    tag = r"<[^>]+>"
    return replace(html, tag => "")
end

println(strip_html("<p>Hello <b>World</b></p>"))
# "Hello World"

"""
    normalize_whitespace(text)

Collapse every run of whitespace in `text` into a single space and trim leading and
trailing whitespace from the result.
"""
function normalize_whitespace(text)
    collapsed = replace(text, r"\s+" => " ")
    return strip(collapsed)
end

println(normalize_whitespace("  hello   world  "))
# "hello world"

"""
    remove_special(text)

Return `text` with every character that is neither a word character nor whitespace
removed. Underscores count as word characters and are kept.
"""
function remove_special(text)
    unwanted = r"[^\w\s]"
    return replace(text, unwanted => "")
end

println(remove_special("Hello, World! @2023"))
# "Hello World 2023"

高级特性

后向引用

julia
# Match a word that is immediately repeated; \1 refers back to capture group 1
text = "the the quick brown fox fox"
pattern = r"\b(\w+)\s+\1\b"

for m in eachmatch(pattern, text)
    println("重复单词: $(m.match)")
end
# Output: "the the", "fox fox"

前瞻和后顾

julia
# Positive lookahead (?=...)
# Match a "foo" that is followed by "bar"
text = "foobar foobaz"
pattern = r"foo(?=bar)"
m = match(pattern, text)
println(m.match)  # "foo"

# Negative lookahead (?!...)
# Match a "foo" that is not followed by "bar"
pattern = r"foo(?!bar)"
m = match(pattern, text)
println(m.offset)  # 8 (the second foo)

# Positive lookbehind (?<=...)
text = "USD100 EUR200"
pattern = r"(?<=USD)\d+"
m = match(pattern, text)
println(m.match)  # "100"

# Negative lookbehind (?<!...)
# (?<!USD) alone is not enough: the search would simply start one character later,
# inside "100", and return "00" (which is preceded by "SD1", not "USD").
# The additional (?<!\d) forces a match to begin at the first digit of a number.
pattern = r"(?<!USD)(?<!\d)\d+"  # a whole number that is not preceded by USD
m = match(pattern, text)
println(m.match)  # "200"

非贪婪匹配

julia
text = "<div>content</div>"

# Greedy: .* consumes as much as possible
m = match(r"<.*>", text)
println(m.match)  # "<div>content</div>"

# Non-greedy: a trailing ? makes the quantifier match as little as possible
m = match(r"<.*?>", text)
println(m.match)  # "<div>"

性能优化

编译一次,多次使用

julia
# Keep the regular expression in a constant so it is shared by every call.
# The pattern ends with \z rather than $ because $ also matches just before a
# trailing newline, which would let "a@b.com\n" pass validation.
const EMAIL_PATTERN = r"^[\w\.-]+@[\w\.-]+\.\w+\z"

"""
    validate_email(email)

Return `true` if `email` matches `EMAIL_PATTERN` in its entirety, `false` otherwise.
"""
function validate_email(email)
    return occursin(EMAIL_PATTERN, email)
end

# Validate a batch of addresses
emails = ["a@b.com", "invalid", "x@y.org"]
for email in emails
    println("$email: $(validate_email(email))")
end

避免过度复杂的模式

julia
# Overly complex patterns can cause performance problems.
# Prefer simpler patterns and split the work into steps where necessary.

# Discouraged: one complicated regular expression
# r"^(?=.*[A-Z])(?=.*[a-z])(?=.*\d).{8,}$"

# Preferred: check each requirement on its own
"""
    is_strong_password(pwd)

Return `true` when `pwd` is at least 8 characters long and contains an uppercase
letter A-Z, a lowercase letter a-z and a digit; return `false` otherwise.
"""
function is_strong_password(pwd)
    # Short-circuits on the first requirement that fails.
    return length(pwd) >= 8 &&
           occursin(r"[A-Z]", pwd) &&
           occursin(r"[a-z]", pwd) &&
           occursin(r"\d", pwd)
end

常用正则表达式

julia
# Integer (optional leading minus sign)
r"-?\d+"

# Floating-point number (optional minus sign, optional fractional part)
r"-?\d+\.?\d*"

# E-mail address
r"[\w\.-]+@[\w\.-]+\.\w+"

# URL
r"https?://[\w\.-]+(?:/[\w\.-]*)?"

# IP address (four dot-separated groups of 1-3 digits; octet ranges are not checked)
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"

# Date (YYYY-MM-DD)
r"\d{4}-\d{2}-\d{2}"

# Time (HH:MM:SS)
r"\d{2}:\d{2}:\d{2}"

# Chinese characters (code points U+4E00 through U+9FFF)
r"[\u4e00-\u9fff]+"

下一步

学习完正则表达式后，请继续学习后续章节。

本站内容仅供学习和研究使用。