Julia 正则表达式

正则表达式是强大的文本模式匹配工具，Julia 提供了完整的正则表达式支持。

创建正则表达式

基本语法

julia

# Build a regular expression with the r"" string-literal syntax
pattern = r"hello"
println(typeof(pattern))  # Regex

# Test whether the pattern occurs in a string
text = "hello world"
println(occursin(pattern, text))  # true

# A flag is appended directly after the closing quote
pattern_i = r"hello"i  # i = case-insensitive
println(occursin(pattern_i, "HELLO"))  # true

正则表达式标志

julia

# i - case-insensitive matching
r"hello"i

# m - multi-line mode (^ and $ match at every line)
r"^hello"m

# s - single-line mode (. also matches a newline)
r"a.b"s

# x - extended mode (whitespace in the pattern is ignored and # starts a
#     comment inside the pattern, as in the literal below)
r"""
  \d+    # 匹配数字
  \s*    # 可选空格
  [a-z]+ # 匹配字母
"""x

# Several flags can be combined
r"hello"ims

基本匹配

occursin 检查匹配

julia

text = "Hello, Julia!"

# Plain containment check
println(occursin(r"Julia", text))   # true
println(occursin(r"Python", text))  # false

# Case-insensitive check
println(occursin(r"hello"i, text))  # true

match 查找匹配

julia

text = "My phone is 123-456-7890"

# Find the first match
m = match(r"\d{3}-\d{3}-\d{4}", text)
println(m)          # RegexMatch("123-456-7890")
println(m.match)    # "123-456-7890"
println(m.offset)   # 13 (index where the match starts)

# match returns nothing when the pattern is not found
m = match(r"email", text)
println(m === nothing)  # true

eachmatch 迭代所有匹配

julia

text = "apple: 5, banana: 3, cherry: 8"

# Print every run of digits, one per line: 5, 3, 8
foreach(m -> println(m.match), eachmatch(r"\d+", text))

# Materialize the matches first, then pull the matched text out of each one
matches = collect(eachmatch(r"\d+", text))
numbers = map(m -> m.match, matches)
println(numbers)  # SubString{String}["5", "3", "8"]

正则表达式语法

字符类

julia

text = "a1B2c3"

# Digits
println(collect(eachmatch(r"\d", text)))  # 1, 2, 3

# Letters
println(collect(eachmatch(r"[a-zA-Z]", text)))  # a, B, c

# Common character classes (ASCII equivalents shown in brackets)
# \d  - digit [0-9]
# \D  - non-digit
# \w  - word character [a-zA-Z0-9_]
# \W  - non-word character
# \s  - whitespace character
# \S  - non-whitespace character
# .   - any character (except a newline)
# NOTE(review): Julia compiles patterns in Unicode mode by default, so \d, \w
# and \s also match non-ASCII digits, letters and whitespace — confirm against
# the @r_str documentation if ASCII-only matching is required.

量词

julia

# ?   - zero or one
# *   - zero or more
# +   - one or more
# {n} - exactly n
# {n,} - at least n
# {n,m} - between n and m

text = "aaa ab abbb"

println(match(r"ab*", text).match)   # "a" (b occurs zero times)
println(match(r"ab+", text).match)   # "ab"
println(match(r"ab{2}", text).match) # "abb"

锚点

julia

# ^  - start of the string
# $  - end of the string
# \b - word boundary

text = "Hello World"

# Each anchored pattern occurs in `text`, so this prints true three times
for anchored in (r"^Hello", r"World$", r"\bWorld\b")
    println(occursin(anchored, text))
end

# Multi-line mode: ^ matches at the start of every line
multiline = "line1\nline2"
foreach(m -> println(m.match), eachmatch(r"^line\d"m, multiline))
# prints line1, then line2

分组

julia

text = "John Smith, Jane Doe"

# Capture groups
pattern = r"(\w+) (\w+)"
m = match(pattern, text)

println(m.match)      # "John Smith"
println(m.captures)   # ["John", "Smith"]
println(m[1])         # "John"
println(m[2])         # "Smith"

# Named capture groups, indexed by symbol
pattern = r"(?<first>\w+) (?<last>\w+)"
m = match(pattern, text)
println(m[:first])    # "John"
println(m[:last])     # "Smith"

非捕获组

julia

# (?:...) groups without capturing
text = "Mr. John Smith"
pattern = r"(?:Mr\.|Mrs\.|Ms\.) (\w+) (\w+)"
m = match(pattern, text)
println(m.captures)  # ["John", "Smith"] — the title is not captured

选择

julia

# | - alternation ("or")
text = "I have a cat and a dog"

pattern = r"cat|dog"
foreach(m -> println(m.match), eachmatch(pattern, text))
# prints cat, then dog

字符串替换

replace 函数

julia

text = "Hello, World!"

# Plain replacement
result = replace(text, r"World" => "Julia")
println(result)  # "Hello, Julia!"

# Case-insensitive replacement
result = replace("HELLO", r"hello"i => "hi")
println(result)  # "hi"

# Refer to capture groups from an s"" substitution string
text = "2023-12-25"
result = replace(text, r"(\d{4})-(\d{2})-(\d{2})" => s"\2/\3/\1")
println(result)  # "12/25/2023"

# Use a function as the replacement
text = "hello world"
result = replace(text, r"\w+" => uppercase)
println(result)  # "HELLO WORLD"

复杂替换

julia

# Transform every match with a function.
# The dollar signs are escaped: a bare $ inside a string literal starts
# interpolation, and a digit right after it is a syntax error.
text = "Price: \$100 and \$200"

"""
    double_price(m)

Return the decimal string for twice the integer written in `m`.

`replace` calls its replacement function with the matched text (a string),
not with a `RegexMatch`, so `m` is parsed directly. A `RegexMatch` is also
accepted for callers that already hold one.
"""
function double_price(m)
    price = parse(Int, m)
    return string(price * 2)
end
double_price(m::RegexMatch) = double_price(m.match)

result = replace(text, r"\d+" => double_price)
println(result)  # Price: $200 and $400

分割字符串

julia

text = "apple, banana; cherry: date"

# Split on several delimiters, consuming the whitespace after each one
parts = split(text, r"[,;:]\s*")
println(parts)  # ["apple", "banana", "cherry", "date"]

# keepempty=true keeps empty fields in the result; it does not keep the
# delimiters themselves. The leading spaces in the output come from this
# pattern no longer consuming the whitespace after each delimiter.
parts = split(text, r"[,;:]", keepempty=true)
println(parts)  # ["apple", " banana", " cherry", " date"]

# Split on runs of whitespace
text = "hello   world\tjulia"
parts = split(text, r"\s+")
println(parts)  # ["hello", "world", "julia"]

实用模式

验证模式

julia

"""
    is_valid_email(email)

Return `true` when `email`, anchored at both ends, has the shape
`name@host.tld`: word characters, dots or hyphens on each side of the `@`,
ending in a dot followed by word characters. This checks shape only.
"""
is_valid_email(email) = occursin(r"^[\w\.-]+@[\w\.-]+\.\w+$", email)

println(is_valid_email("test@example.com"))   # true
println(is_valid_email("invalid-email"))      # false

"""
    is_valid_phone(phone)

Return `true` when `phone` is three digits, a hyphen, three digits, a hyphen
and four digits, with the pattern anchored at both ends.
"""
function is_valid_phone(phone)
    found = match(r"^\d{3}-\d{3}-\d{4}$", phone)
    return found !== nothing
end

println(is_valid_phone("123-456-7890"))  # true
println(is_valid_phone("12345"))         # false

"""
    is_valid_url(url)

Return `true` when `url` is an http or https address: the scheme, a host made
of word characters, dots or hyphens, then any number of slash-separated path
segments built from the same characters.
"""
function is_valid_url(url)
    url_shape = r"^https?://[\w\.-]+(?:/[\w\.-]*)*$"
    return occursin(url_shape, url)
end

println(is_valid_url("https://example.com/path"))  # true

提取模式

julia

"""
    extract_numbers(text)

Return every run of digits found in `text`, parsed as `Int`, in order of
appearance.
"""
extract_numbers(text) = map(m -> parse(Int, m.match), eachmatch(r"\d+", text))

println(extract_numbers("I have 3 apples and 5 oranges"))
# [3, 5]

"""
    extract_words(text)

Return the matched text of every run of word characters in `text`, in order
of appearance.
"""
function extract_words(text)
    word_matches = eachmatch(r"\b\w+\b", text)
    return collect(m.match for m in word_matches)
end

println(extract_words("Hello, World!"))
# ["Hello", "World"]

"""
    extract_pairs(text)

Collect every `key=value` occurrence in `text` (both sides made of word
characters) into a `Dict{String, String}`. When a key appears more than once,
the last value wins.
"""
function extract_pairs(text)
    assignments = eachmatch(r"(\w+)=(\w+)", text)
    return Dict{String, String}(m[1] => m[2] for m in assignments)
end

println(extract_pairs("name=Alice age=30"))
# Dict("name" => "Alice", "age" => "30")

清理文本

julia

"""
    strip_html(html)

Return `html` with every `<...>` tag removed; the text between tags is kept.
"""
strip_html(html) = replace(html, r"<[^>]+>" => "")

println(strip_html("<p>Hello <b>World</b></p>"))
# "Hello World"

"""
    normalize_whitespace(text)

Collapse every run of whitespace in `text` to a single space, then strip the
leading and trailing whitespace from the result.
"""
function normalize_whitespace(text)
    collapsed = replace(text, r"\s+" => " ")
    return strip(collapsed)
end

println(normalize_whitespace("  hello   world  "))
# "hello world"

"""
    remove_special(text)

Return `text` with every character that is neither a word character nor
whitespace removed.
"""
remove_special(text) = replace(text, r"[^\w\s]" => "")

println(remove_special("Hello, World! @2023"))
# "Hello World 2023"

高级特性

后向引用

julia

# Find a word that is immediately repeated: \1 refers back to whatever the
# first capture group matched
text = "the the quick brown fox fox"
pattern = r"\b(\w+)\s+\1\b"

foreach(eachmatch(pattern, text)) do m
    println("重复单词: $(m.match)")
end
# reports "the the" and then "fox fox"

前瞻和后顾

julia

# Positive lookahead (?=...): match "foo" only when "bar" follows
text = "foobar foobaz"
pattern = r"foo(?=bar)"
m = match(pattern, text)
println(m.match)  # "foo"

# Negative lookahead (?!...): match "foo" only when "bar" does not follow
pattern = r"foo(?!bar)"
m = match(pattern, text)
println(m.offset)  # 8 (the second foo)

# Positive lookbehind (?<=...): digits directly preceded by "USD"
text = "USD100 EUR200"
pattern = r"(?<=USD)\d+"
m = match(pattern, text)
println(m.match)  # "100"

# Negative lookbehind (?<!...): a number that is not preceded by "USD".
# (?<!\d) pins the match to the start of a digit run. With (?<!USD) alone the
# engine skips the "1" and matches "00", because "00" is preceded by "SD1",
# not by "USD".
pattern = r"(?<!USD)(?<!\d)\d+"
m = match(pattern, text)
println(m.match)  # "200"

非贪婪匹配

julia

text = "<div>content</div>"

# Greedy: .* takes as much as it can
m = match(r"<.*>", text)
println(m.match)  # "<div>content</div>"

# Lazy: appending ? makes the quantifier take as little as it can
m = match(r"<.*?>", text)
println(m.match)  # "<div>"

性能优化

编译一次，多次使用

julia

# Keep the pattern in a constant so every call refers to the same Regex
const EMAIL_PATTERN = r"^[\w\.-]+@[\w\.-]+\.\w+$"

"""
    validate_email(email)

Return whether `EMAIL_PATTERN` occurs in `email`.
"""
validate_email(email) = occursin(EMAIL_PATTERN, email)

# Validate a batch of addresses, printing one "address: result" line each
emails = ["a@b.com", "invalid", "x@y.org"]
foreach(emails) do email
    println("$email: $(validate_email(email))")
end

避免过度复杂的模式

julia

# A single elaborate pattern can be slow and hard to read.
# Prefer simpler patterns and, where needed, check in several steps.

# Discouraged: one complex regex
# r"^(?=.*[A-Z])(?=.*[a-z])(?=.*\d).{8,}$"

# Preferred: one small check per requirement
"""
    is_strong_password(pwd)

Return `true` when `pwd` has at least 8 characters and contains an uppercase
ASCII letter, a lowercase ASCII letter and a digit. The checks run in that
order and stop at the first one that fails.
"""
function is_strong_password(pwd)
    long_enough = length(pwd) >= 8
    return long_enough &&
        occursin(r"[A-Z]", pwd) &&
        occursin(r"[a-z]", pwd) &&
        occursin(r"\d", pwd)
end

常用正则表达式

julia

# Integer
r"-?\d+"

# Floating-point number
r"-?\d+\.?\d*"

# E-mail address
r"[\w\.-]+@[\w\.-]+\.\w+"

# URL: host followed by any number of /path segments, so multi-segment
# paths such as /a/b are matched in full instead of being cut after /a
r"https?://[\w\.-]+(?:/[\w\.-]*)*"

# IP address (shape only; octet ranges are not checked)
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"

# Date (YYYY-MM-DD)
r"\d{4}-\d{2}-\d{2}"

# Time (HH:MM:SS)
r"\d{2}:\d{2}:\d{2}"

# Chinese characters
r"[\u4e00-\u9fff]+"

下一步

学习完正则表达式后，请继续学习：

- 文件读写 - 处理文件中的文本
- 字符串 - 更多字符串操作
- 函数 - 封装正则处理函数

Julia 正则表达式 ​

创建正则表达式 ​

基本语法 ​

正则表达式标志 ​

基本匹配 ​

occursin 检查匹配 ​

match 查找匹配 ​

eachmatch 迭代所有匹配 ​

正则表达式语法 ​

字符类 ​

量词 ​

锚点 ​

分组 ​

非捕获组 ​

选择 ​

字符串替换 ​

replace 函数 ​

复杂替换 ​

分割字符串 ​

实用模式 ​

验证模式 ​

提取模式 ​

清理文本 ​

高级特性 ​

后向引用 ​

前瞻和后顾 ​

非贪婪匹配 ​

性能优化 ​

编译一次，多次使用 ​

避免过度复杂的模式 ​

常用正则表达式 ​

下一步 ​

Julia 正则表达式

创建正则表达式

基本语法

正则表达式标志

基本匹配

occursin 检查匹配

match 查找匹配

eachmatch 迭代所有匹配

正则表达式语法

字符类

量词

锚点

分组

非捕获组

选择

字符串替换

replace 函数

复杂替换

分割字符串

实用模式

验证模式

提取模式

清理文本

高级特性

后向引用

前瞻和后顾

非贪婪匹配

性能优化

编译一次，多次使用

避免过度复杂的模式

常用正则表达式

下一步