正则表达式
概述
正则表达式(Regular Expression)是一种强大的文本模式匹配工具,PHP使用PCRE(Perl Compatible Regular Expressions)库来支持正则表达式。本章将学习如何使用正则表达式进行文本匹配、替换、分割和验证。
基础语法
PCRE函数介绍
php
<?php
// 主要的PCRE函数
// preg_match() - 执行匹配
// preg_match_all() - 执行全局匹配
// preg_replace() - 执行搜索和替换
// preg_split() - 用正则表达式分割字符串
// 基本匹配示例
$text = "Hello World 2024";
$pattern = '/World/';
if (preg_match($pattern, $text)) {
echo "找到匹配!\n";
}
// 获取匹配结果
if (preg_match('/(\d+)/', $text, $matches)) {
echo "找到数字: " . $matches[1] . "\n";
}
?>基本元字符
php
<?php
$text = "The price is $25.99 for item #123";
// . - 匹配任意字符(除换行符)
preg_match('/p.ice/', $text, $matches);
echo "匹配 '.': " . ($matches[0] ?? '无') . "\n";
// * - 匹配前面的字符0次或多次
preg_match('/\d*/', $text, $matches);
echo "匹配 '*': " . ($matches[0] ?? '无') . "\n";
// + - 匹配前面的字符1次或多次
preg_match('/\d+/', $text, $matches);
echo "匹配 '+': " . ($matches[0] ?? '无') . "\n";
// ? - 匹配前面的字符0次或1次
preg_match('/\$?\d+/', $text, $matches);
echo "匹配 '?': " . ($matches[0] ?? '无') . "\n";
?>字符类和预定义字符类
php
<?php
$text = "User ID: A123, Age: 25, Email: user@example.com";
// [abc] - 匹配字符集中的任意一个字符
preg_match('/[AEI]/', $text, $matches);
echo "字符类 [AEI]: " . ($matches[0] ?? '无') . "\n";
// [a-z] - 匹配范围内的字符
preg_match('/[a-z]+/', $text, $matches);
echo "字符类 [a-z]: " . ($matches[0] ?? '无') . "\n";
// 预定义字符类
// \d - 匹配数字 [0-9]
preg_match_all('/\d/', $text, $matches);
echo "数字字符: " . implode(', ', $matches[0]) . "\n";
// \w - 匹配单词字符 [a-zA-Z0-9_]
preg_match_all('/\w+/', $text, $matches);
echo "单词字符: " . implode(', ', $matches[0]) . "\n";
// \s - 匹配空白字符
preg_match_all('/\s/', $text, $matches);
echo "空白字符数量: " . count($matches[0]) . "\n";
?>常用验证模式
数据验证类
php
<?php
class Validator {
// 邮箱验证
public static function validateEmail($email) {
$pattern = '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/';
return preg_match($pattern, $email);
}
// 手机号验证(中国大陆)
public static function validatePhone($phone) {
$pattern = '/^1[3-9]\d{9}$/';
return preg_match($pattern, $phone);
}
// 身份证号验证(简化版)
public static function validateIdCard($idCard) {
$pattern = '/^[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dX]$/';
return preg_match($pattern, $idCard);
}
// 密码强度验证
public static function validatePassword($password) {
// 至少8位,包含大小写字母、数字和特殊字符
$pattern = '/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/';
return preg_match($pattern, $password);
}
// URL验证
public static function validateUrl($url) {
$pattern = '/^https?:\/\/(?:[-\w.])+(?:\:[0-9]+)?(?:\/(?:[\w\/_.])*(?:\?(?:[\w&=%.])*)?(?:\#(?:[\w.])*)?)?$/';
return preg_match($pattern, $url);
}
// IP地址验证
public static function validateIP($ip) {
$pattern = '/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/';
return preg_match($pattern, $ip);
}
}
// 测试验证器
$testData = [
'email' => 'user@example.com',
'phone' => '13800138000',
'password' => 'StrongP@ss123',
'url' => 'https://www.example.com',
'ip' => '192.168.1.1'
];
foreach ($testData as $type => $value) {
$method = 'validate' . ucfirst($type);
if (method_exists('Validator', $method)) {
$isValid = Validator::$method($value);
echo "$type ($value): " . ($isValid ? "有效" : "无效") . "\n";
}
}
?>文本处理和替换
搜索和替换
php
<?php
$text = "联系我们:电话 010-12345678,手机 138-0013-8000,邮箱 contact@example.com";
// 基本替换
$result = preg_replace('/\d{3}-\d{4}-\d{4}/', '***-****-****', $text);
echo "隐藏手机号: $result\n";
// 使用回调函数替换
$result = preg_replace_callback('/(\w+)@([\w.-]+)/', function($matches) {
return $matches[1] . '@***';
}, $text);
echo "隐藏邮箱域名: $result\n";
// 使用分组进行高级替换
$html = '<img src="image1.jpg" alt="图片1"><img src="image2.png" alt="图片2">';
$result = preg_replace('/<img src="([^"]+)" alt="([^"]+)">/', '<figure><img src="$1"><figcaption>$2</figcaption></figure>', $html);
echo "HTML转换: $result\n";
?>文本分割和提取
php
<?php
// 分割字符串
$text = "苹果,香蕉;橙子|葡萄 草莓";
$fruits = preg_split('/[,;|\s]+/', $text);
print_r($fruits);
// 提取日志信息
$log = "2024-01-15 10:30:45 [ERROR] Database connection failed";
$pattern = '/(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)/';
if (preg_match($pattern, $log, $matches)) {
echo "日期: " . $matches[1] . "\n";
echo "时间: " . $matches[2] . "\n";
echo "级别: " . $matches[3] . "\n";
echo "消息: " . $matches[4] . "\n";
}
// 提取所有邮箱地址
$text = "联系方式:admin@example.com, support@test.org, info@company.net";
preg_match_all('/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/', $text, $matches);
echo "找到的邮箱地址:\n";
foreach ($matches[0] as $email) {
echo "- $email\n";
}
?>高级功能
前瞻和后瞻断言
php
<?php
$text = "password123, admin456, user789, guest000";
// 正向前瞻 (?=...)
// 匹配后面跟着数字的单词
preg_match_all('/\w+(?=\d+)/', $text, $matches);
echo "后面跟数字的单词: " . implode(', ', $matches[0]) . "\n";
// 正向后瞻 (?<=...)
// 匹配前面有特定模式的内容
$text3 = "价格:$100, 费用:$50, 税:$10";
preg_match_all('/(?<=\$)\d+/', $text3, $matches);
echo "价格数字: " . implode(', ', $matches[0]) . "\n";
?>实际应用案例:日志解析器
php
<?php
class LogParser {
private $patterns = [
'apache' => '/^(\S+) \S+ \S+ \[([\w:\/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)/',
'custom' => '/^\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+): (.+)/'
];
public function parseLogLine($line, $type = 'custom') {
if (!isset($this->patterns[$type])) {
throw new InvalidArgumentException("不支持的日志类型: $type");
}
$pattern = $this->patterns[$type];
if (preg_match($pattern, trim($line), $matches)) {
return $this->formatLogEntry($matches, $type);
}
return null;
}
private function formatLogEntry($matches, $type) {
switch ($type) {
case 'apache':
return [
'ip' => $matches[1],
'timestamp' => $matches[2],
'method' => $matches[3],
'url' => $matches[4],
'status' => $matches[6],
'size' => $matches[7]
];
case 'custom':
return [
'timestamp' => $matches[1],
'level' => $matches[2],
'message' => $matches[3]
];
default:
return $matches;
}
}
}
// 使用示例
$parser = new LogParser();
$logLine = "[2024-01-15 10:30:45] ERROR: Database connection failed";
$parsed = $parser->parseLogLine($logLine);
if ($parsed) {
echo "时间: {$parsed['timestamp']}\n";
echo "级别: {$parsed['level']}\n";
echo "消息: {$parsed['message']}\n";
}
?>最佳实践和性能优化
错误处理和安全使用
php
<?php
// 安全的正则表达式使用
function safeRegexMatch($pattern, $subject) {
$result = preg_match($pattern, $subject, $matches);
if ($result === false) {
$error = preg_last_error();
$errorMessages = [
PREG_NO_ERROR => '没有错误',
PREG_INTERNAL_ERROR => '内部错误',
PREG_BACKTRACK_LIMIT_ERROR => '回溯限制错误',
PREG_RECURSION_LIMIT_ERROR => '递归限制错误',
PREG_BAD_UTF8_ERROR => 'UTF-8错误'
];
throw new RuntimeException('正则表达式错误: ' . ($errorMessages[$error] ?? '未知错误'));
}
return [$result, $matches ?? []];
}
// 验证用户输入
function validateUserInput($input, $type) {
$patterns = [
'username' => '/^[a-zA-Z0-9_]{3,20}$/',
'email' => '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/',
'phone' => '/^1[3-9]\d{9}$/'
];
if (!isset($patterns[$type])) {
throw new InvalidArgumentException("不支持的验证类型: $type");
}
return preg_match($patterns[$type], $input) === 1;
}
// 使用示例
try {
list($result, $matches) = safeRegexMatch('/\d+/', 'test123');
if ($result) {
echo "找到数字: " . $matches[0] . "\n";
}
} catch (RuntimeException $e) {
echo "错误: " . $e->getMessage() . "\n";
}
?>性能优化技巧
php
<?php
// 1. 使用字符类而不是多选分支
// 慢:(a|b|c|d|e)
// 快:[a-e]
// 2. 避免不必要的回溯
function optimizedEmailValidation($email) {
// 使用原子分组 (?>...) 避免回溯
$pattern = '/^[a-zA-Z0-9]++(?:\.[a-zA-Z0-9]++)*+@[a-zA-Z0-9]++(?:\.[a-zA-Z0-9]++)*+$/';
return preg_match($pattern, $email);
}
// 3. 处理UTF-8字符
$text = "中文测试123";
// 错误:未指定u修饰符
$wrong = '/\w+/';
// 正确:使用u修饰符支持Unicode
$correct = '/\w+/u';
preg_match_all($wrong, $text, $matches1);
preg_match_all($correct, $text, $matches2);
echo "不使用u修饰符: " . implode(', ', $matches1[0]) . "\n";
echo "使用u修饰符: " . implode(', ', $matches2[0]) . "\n";
?>常见错误和解决方案
转义字符问题
php
<?php
// 错误:未正确转义
$wrong = '/\d+.\d+/'; // . 在正则中表示任意字符
// 正确:正确转义
$correct = '/\d+\.\d+/'; // \. 表示字面意义的点
$number = "3.14";
echo "错误模式: " . (preg_match($wrong, $number) ? "匹配" : "不匹配") . "\n";
echo "正确模式: " . (preg_match($correct, $number) ? "匹配" : "不匹配") . "\n";
?>贪婪匹配问题
php
<?php
// 问题:贪婪匹配导致意外结果
$html = '<div>content1</div><div>content2</div>';
$greedy = '/<div>.*<\/div>/';
$nonGreedy = '/<div>.*?<\/div>/';
preg_match($greedy, $html, $matches1);
preg_match($nonGreedy, $html, $matches2);
echo "贪婪匹配: " . $matches1[0] . "\n";
echo "非贪婪匹配: " . $matches2[0] . "\n";
?>总结
本章介绍了PHP中正则表达式的使用:
- 基础语法:元字符、字符类、量词
- 高级特性:分组、断言、前瞻后瞻
- 实际应用:数据验证、文本处理、日志解析
- 性能优化:避免回溯、使用适当的模式
- 错误处理:安全使用、异常处理
掌握正则表达式能够大大提高文本处理的效率和准确性。在下一章中,我们将学习PHP的标准库和内置函数。