Julia 正则表达式
正则表达式是强大的文本模式匹配工具,Julia 提供了完整的正则表达式支持。
创建正则表达式
基本语法
julia
# 使用 r"" 语法创建正则表达式
pattern = r"hello"
println(typeof(pattern)) # Regex
# 匹配测试
text = "hello world"
println(occursin(pattern, text)) # true
# 带标志
pattern_i = r"hello"i # 忽略大小写
println(occursin(pattern_i, "HELLO")) # true
正则表达式标志
julia
# i - 忽略大小写
r"hello"i
# m - 多行模式(^ 和 $ 匹配每行)
r"^hello"m
# s - 单行模式(. 匹配换行符)
r"a.b"s
# x - 扩展模式(允许空格和注释)
r"""
\d+ # 匹配数字
\s* # 可选空格
[a-z]+ # 匹配字母
"""x
# 组合多个标志
r"hello"ims
基本匹配
occursin 检查匹配
julia
text = "Hello, Julia!"
# 简单检查
println(occursin(r"Julia", text)) # true
println(occursin(r"Python", text)) # false
# 忽略大小写
println(occursin(r"hello"i, text)) # true
match 查找匹配
julia
text = "My phone is 123-456-7890"
# 查找第一个匹配
m = match(r"\d{3}-\d{3}-\d{4}", text)
println(m) # RegexMatch("123-456-7890")
println(m.match) # "123-456-7890"
println(m.offset) # 13(匹配开始位置)
# 没有匹配返回 nothing
m = match(r"email", text)
println(m === nothing) # true
eachmatch 迭代所有匹配
julia
text = "apple: 5, banana: 3, cherry: 8"
# 查找所有数字
for m in eachmatch(r"\d+", text)
println(m.match)
end
# 输出: 5, 3, 8
# 收集为数组
matches = collect(eachmatch(r"\d+", text))
numbers = [m.match for m in matches]
println(numbers) # ["5", "3", "8"]
正则表达式语法
字符类
julia
text = "a1B2c3"
# 数字
println(collect(eachmatch(r"\d", text))) # 1, 2, 3
# 字母
println(collect(eachmatch(r"[a-zA-Z]", text))) # a, B, c
# 常用字符类
# \d - 数字 [0-9]
# \D - 非数字
# \w - 单词字符 [a-zA-Z0-9_]
# \W - 非单词字符
# \s - 空白字符
# \S - 非空白字符
# . - 任意字符(除换行)
量词
julia
# ? - 0或1个
# * - 0或多个
# + - 1或多个
# {n} - 恰好n个
# {n,} - 至少n个
# {n,m} - n到m个
text = "aaa ab abbb"
println(match(r"ab*", text).match) # "a"(b出现0次)
println(match(r"ab+", text).match) # "ab"
println(match(r"ab{2}", text).match) # "abb"
锚点
julia
# ^ - 字符串开头
# $ - 字符串结尾
# \b - 单词边界
text = "Hello World"
println(occursin(r"^Hello", text)) # true
println(occursin(r"World$", text)) # true
println(occursin(r"\bWorld\b", text)) # true
# 多行模式
multiline = "line1\nline2"
for m in eachmatch(r"^line\d"m, multiline)
println(m.match)
end
# 输出: line1, line2
分组
julia
text = "John Smith, Jane Doe"
# 捕获组
pattern = r"(\w+) (\w+)"
m = match(pattern, text)
println(m.match) # "John Smith"
println(m.captures) # ["John", "Smith"]
println(m[1]) # "John"
println(m[2]) # "Smith"
# 命名捕获组
pattern = r"(?<first>\w+) (?<last>\w+)"
m = match(pattern, text)
println(m[:first]) # "John"
println(m[:last]) # "Smith"
非捕获组
julia
# (?:...) 分组但不捕获
text = "Mr. John Smith"
pattern = r"(?:Mr\.|Mrs\.|Ms\.) (\w+) (\w+)"
m = match(pattern, text)
println(m.captures) # ["John", "Smith"],不包含称谓
选择
julia
# | - 或
text = "I have a cat and a dog"
pattern = r"cat|dog"
for m in eachmatch(pattern, text)
println(m.match)
end
# 输出: cat, dog
字符串替换
replace 函数
julia
text = "Hello, World!"
# 简单替换
result = replace(text, r"World" => "Julia")
println(result) # "Hello, Julia!"
# 忽略大小写
result = replace("HELLO", r"hello"i => "hi")
println(result) # "hi"
# 使用捕获组
text = "2023-12-25"
result = replace(text, r"(\d{4})-(\d{2})-(\d{2})" => s"\2/\3/\1")
println(result) # "12/25/2023"
# 使用函数
text = "hello world"
result = replace(text, r"\w+" => uppercase)
println(result) # "HELLO WORLD"
复杂替换
julia
# Transform each match with a function.
# `\$` is escaped: an unescaped `$100` inside a string literal is rejected by
# Julia as invalid interpolation syntax.
text = "Price: \$100 and \$200"

"""
    double_price(m)

Return the decimal string for twice the integer written in `m`.

`replace` hands its replacement function the matched substring, not a
`RegexMatch`, so the string method does the work. The `RegexMatch` method
forwards the matched text for callers that use `match` or `eachmatch`.
"""
double_price(m::AbstractString) = string(parse(Int, m) * 2)
double_price(m::RegexMatch) = double_price(m.match)

result = replace(text, r"\d+" => double_price)
println(result) # "Price: $200 and $400"
分割字符串
julia
text = "apple, banana; cherry: date"
# Split on any of several separators, swallowing the whitespace after them.
parts = split(text, r"[,;:]\s*")
println(parts) # ["apple", "banana", "cherry", "date"]
# Keep each separator attached to the field before it. `keepempty` only
# controls whether empty fields are returned, so it cannot do this. The
# lookbehind splits on the whitespace that follows a separator and leaves the
# separator itself in the text.
parts = split(text, r"(?<=[,;:])\s*")
println(parts) # ["apple,", "banana;", "cherry:", "date"]
# 按空白分割
text = "hello world\tjulia"
parts = split(text, r"\s+")
println(parts) # ["hello", "world", "julia"]
实用模式
验证模式
julia
# Email validation
"""
    is_valid_email(email)

Return `true` when `email` matches the anchored pattern `name@host.suffix`,
where `name` and `host` are runs of word characters, dots and hyphens and
`suffix` is a run of word characters. Return `false` otherwise.
"""
is_valid_email(email) = occursin(r"^[\w\.-]+@[\w\.-]+\.\w+$", email)
println(is_valid_email("test@example.com")) # true
println(is_valid_email("invalid-email")) # false
# Phone number validation
"""
    is_valid_phone(phone)

Return `true` when `phone` has the anchored form of three digits, a hyphen,
three digits, a hyphen and four digits. Return `false` otherwise.
"""
function is_valid_phone(phone)
    found = match(r"^\d{3}-\d{3}-\d{4}$", phone)
    return found !== nothing
end
println(is_valid_phone("123-456-7890")) # true
println(is_valid_phone("12345")) # false
# URL validation
"""
    is_valid_url(url)

Return `true` when `url` matches the anchored pattern: `http://` or
`https://`, a host made of word characters, dots and hyphens, then any number
of `/segment` parts drawn from the same characters. Return `false` otherwise.
"""
is_valid_url(url) = occursin(r"^https?://[\w\.-]+(?:/[\w\.-]*)*$", url)
println(is_valid_url("https://example.com/path")) # true
提取模式
julia
# Extract every number
"""
    extract_numbers(text)

Return the runs of digits found in `text`, each parsed as an `Int`, in order
of appearance.
"""
function extract_numbers(text)
    digit_runs = eachmatch(r"\d+", text)
    return map(hit -> parse(Int, hit.match), digit_runs)
end
println(extract_numbers("I have 3 apples and 5 oranges"))
# [3, 5]
# Extract every word
"""
    extract_words(text)

Return the matched text of every run of word characters in `text` that is
delimited by word boundaries, in order of appearance.
"""
function extract_words(text)
    word_hits = eachmatch(r"\b\w+\b", text)
    return map(hit -> hit.match, word_hits)
end
println(extract_words("Hello, World!"))
# ["Hello", "World"]
# Extract key=value pairs
"""
    extract_pairs(text)

Return a `Dict{String, String}` with one entry per `key=value` occurrence in
`text`, where both sides are runs of word characters. When a key occurs more
than once, the last occurrence wins.
"""
function extract_pairs(text)
    hits = eachmatch(r"(\w+)=(\w+)", text)
    return Dict{String, String}(hit[1] => hit[2] for hit in hits)
end
println(extract_pairs("name=Alice age=30"))
# Dict("name" => "Alice", "age" => "30")
清理文本
julia
# Remove HTML tags
"""
    strip_html(html)

Return `html` with every `<...>` span (an opening angle bracket, one or more
characters other than a closing bracket, then a closing bracket) deleted.
"""
strip_html(html) = replace(html, r"<[^>]+>" => "")
println(strip_html("<p>Hello <b>World</b></p>"))
# "Hello World"
# Normalize whitespace
"""
    normalize_whitespace(text)

Collapse every run of whitespace in `text` to a single space, then strip
leading and trailing whitespace from the result.
"""
normalize_whitespace(text) = replace(text, r"\s+" => " ") |> strip
println(normalize_whitespace(" hello world "))
# "hello world"
# Remove characters that are neither word characters nor whitespace
"""
    remove_special(text)

Return `text` with every character deleted that is neither a word character
nor whitespace.
"""
remove_special(text) = replace(text, r"[^\w\s]" => "")
println(remove_special("Hello, World! @2023"))
# "Hello World 2023"
高级特性
后向引用
julia
# 匹配重复单词
text = "the the quick brown fox fox"
pattern = r"\b(\w+)\s+\1\b"
for m in eachmatch(pattern, text)
println("重复单词: $(m.match)")
end
# 输出: "the the", "fox fox"
前瞻和后顾
julia
# 正向前瞻 (?=...)
# 匹配后面跟着 "bar" 的 "foo"
text = "foobar foobaz"
pattern = r"foo(?=bar)"
m = match(pattern, text)
println(m.match) # "foo"
# 负向前瞻 (?!...)
# 匹配后面不是 "bar" 的 "foo"
pattern = r"foo(?!bar)"
m = match(pattern, text)
println(m.offset) # 8 (第二个 foo)
# Positive lookbehind (?<=...): digits directly preceded by "USD".
text = "USD100 EUR200"
pattern = r"(?<=USD)\d+"
m = match(pattern, text)
println(m.match) # "100"
# Negative lookbehind (?<!...): a number that is not preceded by "USD".
# The extra (?<!\d) stops a match from starting inside "100". With (?<!USD)
# alone the engine rejects the "1" but accepts the next position and
# returns "00".
pattern = r"(?<!USD)(?<!\d)\d+"
m = match(pattern, text)
println(m.match) # "200"
非贪婪匹配
julia
text = "<div>content</div>"
# 贪婪匹配
m = match(r"<.*>", text)
println(m.match) # "<div>content</div>"
# 非贪婪匹配(添加 ?)
m = match(r"<.*?>", text)
println(m.match) # "<div>"
性能优化
编译一次,多次使用
julia
# Keep the regular expression in a constant shared by every call.
const EMAIL_PATTERN = r"^[\w\.-]+@[\w\.-]+\.\w+$"

"""
    validate_email(email)

Return `true` when `email` matches `EMAIL_PATTERN`, `false` otherwise.
"""
validate_email(email) = occursin(EMAIL_PATTERN, email)
# 批量验证
emails = ["a@b.com", "invalid", "x@y.org"]
for email in emails
println("$email: $(validate_email(email))")
end
避免过度复杂的模式
julia
# 复杂模式可能导致性能问题
# 使用更简单的模式,必要时分步处理
# 不好的做法:一个复杂正则
# r"^(?=.*[A-Z])(?=.*[a-z])(?=.*\d).{8,}$"
# 更好的做法:分步检查
function is_strong_password(pwd)
length(pwd) >= 8 || return false
occursin(r"[A-Z]", pwd) || return false
occursin(r"[a-z]", pwd) || return false
occursin(r"\d", pwd) || return false
return true
end
常用正则表达式
julia
# 整数
r"-?\d+"
# 浮点数
r"-?\d+\.?\d*"
# 电子邮件
r"[\w\.-]+@[\w\.-]+\.\w+"
# URL
r"https?://[\w\.-]+(?:/[\w\.-]*)?"
# IP 地址
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
# 日期 (YYYY-MM-DD)
r"\d{4}-\d{2}-\d{2}"
# 时间 (HH:MM:SS)
r"\d{2}:\d{2}:\d{2}"
# 中文字符
r"[\u4e00-\u9fff]+"
下一步
学习完正则表达式后,请继续学习: