第 12 章:扩展知识和进阶应用
2025/9/1 · 大约 14 分钟
第 12 章:扩展知识和进阶应用
学习目标
- 了解Unicode和多语言文本处理
- 掌握递归正则表达式的概念
- 学习正则表达式的替换和回调
- 理解有限状态自动机原理
- 探索正则表达式的局限性和替代方案
12.1 Unicode和多语言文本处理
Unicode基础概念
// Sample strings showing different kinds of Unicode text
const examples = {
// Basic Han (Chinese) characters
chinese: '你好世界',
// Latin letters carrying diacritics
accented: 'café naïve résumé',
// Emoji
emoji: '😀🎉🌟💖',
// Composite characters (family emoji)
// NOTE(review): a family emoji is normally four people joined by U+200D (ZWJ);
// confirm the joiners are still present in this literal.
composite: '👨👩👧👦',
// Several languages mixed in one string
mixed: 'Hello 你好 Bonjour مرحبا'
};
/**
 * Describe every code point of a string.
 *
 * The string is split by code point (not by UTF-16 unit), so an astral
 * character such as an emoji yields a single entry whose `length` is 2.
 *
 * @param {string} text - Text to inspect.
 * @returns {Array<{char: string, codePoint: number, hexCode: string, category: string, length: number}>}
 *   One record per code point, in input order.
 */
function analyzeUnicode(text) {
  const report = [];
  for (const symbol of Array.from(text)) {
    const codePoint = symbol.codePointAt(0);
    // Upper-case hex, left-padded to at least four digits (e.g. "0041", "1F600").
    const hexCode = codePoint.toString(16).toUpperCase().padStart(4, '0');
    report.push({
      char: symbol,
      codePoint,
      hexCode,
      category: getUnicodeCategory(symbol),
      length: symbol.length,
    });
  }
  return report;
}
/**
 * Classify the first code point of `char` into a coarse, hand-picked category.
 *
 * Only a few ranges are recognised (ASCII digits and letters, the basic CJK
 * ideograph block, and the Emoticons block); everything else is 'Other'.
 *
 * @param {string} char - String whose first code point is classified.
 * @returns {string} The category label.
 */
function getUnicodeCategory(char) {
  const codePoint = char.codePointAt(0);
  // [first, last, label] — inclusive code point ranges, checked in order.
  const ranges = [
    [0x0030, 0x0039, 'Digit'],
    [0x0041, 0x005A, 'Uppercase Letter'],
    [0x0061, 0x007A, 'Lowercase Letter'],
    [0x4E00, 0x9FFF, 'CJK Ideograph'],
    [0x1F600, 0x1F64F, 'Emoticons'],
  ];
  const hit = ranges.find(([first, last]) => codePoint >= first && codePoint <= last);
  return hit ? hit[2] : 'Other';
}
// Demo: print the per-code-point breakdown of a string mixing ASCII, Han characters and an emoji
console.log('Unicode分析:', analyzeUnicode('Hello 你好 😀'));
Unicode属性匹配
// Unicode property escapes (\p{...}) need the `u` flag and an ES2018+ engine.
/**
 * Script analysis, extraction, normalization and simplified tokenization of
 * multilingual text, built on Unicode property escapes.
 */
class UnicodeTextProcessor {
  constructor() {
    // Single-character matchers. None carries the `g` flag, so `.test()` is stateless.
    this.patterns = {
      // Any letter, in any script
      letters: /\p{Letter}/u,
      // Any numeric character
      numbers: /\p{Number}/u,
      // Han (Chinese) characters
      chinese: /\p{Script=Han}/u,
      // Emoji. \p{Emoji} is deliberately avoided: it also matches ASCII digits, '#' and '*'.
      emoji: /[\p{Extended_Pictographic}\p{Regional_Indicator}]/u,
      // Arabic script
      arabic: /\p{Script=Arabic}/u,
      // Punctuation
      punctuation: /\p{Punctuation}/u,
      // Currency symbols
      currency: /\p{Currency_Symbol}/u,
      // Math symbols
      math: /\p{Math_Symbol}/u,
      // Whitespace
      whitespace: /\p{White_Space}/u,
      // Uppercase letters
      uppercase: /\p{Uppercase_Letter}/u,
      // Lowercase letters
      lowercase: /\p{Lowercase_Letter}/u
    };
  }

  /**
   * Count how many characters of each known script occur in `text`.
   *
   * @param {string} text - Text to analyse.
   * @returns {Object<string, number>} Script name -> character count; scripts
   *   with no occurrence are omitted.
   */
  analyzeLanguages(text) {
    const scriptPatterns = {
      latin: /\p{Script=Latin}/gu,
      han: /\p{Script=Han}/gu,
      arabic: /\p{Script=Arabic}/gu,
      cyrillic: /\p{Script=Cyrillic}/gu,
      greek: /\p{Script=Greek}/gu,
      hebrew: /\p{Script=Hebrew}/gu,
      japanese: /[\p{Script=Hiragana}\p{Script=Katakana}]/gu,
      korean: /\p{Script=Hangul}/gu,
      thai: /\p{Script=Thai}/gu,
      devanagari: /\p{Script=Devanagari}/gu
    };
    const scripts = {};
    for (const [script, pattern] of Object.entries(scriptPatterns)) {
      const matches = text.match(pattern);
      if (matches) {
        scripts[script] = matches.length;
      }
    }
    return scripts;
  }

  /**
   * Extract the runs of text written in one language/script.
   *
   * Space-separated scripts (english, arabic, russian) keep inner whitespace so
   * that phrases stay together; surrounding whitespace is trimmed and runs that
   * consist only of whitespace are dropped.
   *
   * @param {string} text - Text to search.
   * @param {string} script - 'chinese', 'english', 'arabic', 'russian',
   *   'japanese' or 'korean' (case-insensitive).
   * @returns {string[]} Matching runs in order of appearance; empty when the
   *   script name is unknown or nothing matches.
   */
  extractByScript(text, script) {
    const runPatterns = {
      chinese: /[\p{Script=Han}]+/gu,
      english: /[\p{Script=Latin}\s]+/gu,
      arabic: /[\p{Script=Arabic}\s]+/gu,
      russian: /[\p{Script=Cyrillic}\s]+/gu,
      japanese: /[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}]+/gu,
      korean: /[\p{Script=Hangul}]+/gu
    };
    const key = script.toLowerCase();
    // Own-property check so names like "constructor" do not resolve to Object.prototype members.
    if (!Object.prototype.hasOwnProperty.call(runPatterns, key)) return [];
    const runs = text.match(runPatterns[key]) || [];
    return runs.map((run) => run.trim()).filter((run) => run.length > 0);
  }

  /**
   * Clean up and normalize text.
   *
   * @param {string} text - Text to normalize.
   * @param {Object} [options]
   * @param {boolean} [options.removeEmoji=false] - Strip emoji (digits, '#' and '*' are kept).
   * @param {boolean} [options.removeAccents=false] - Strip combining diacritics (U+0300-U+036F).
   * @param {boolean} [options.lowercaseOnly=false] - Lower-case the result.
   * @param {boolean} [options.removeExtraSpaces=true] - Collapse whitespace runs and trim.
   * @param {string} [options.normalizeForm='NFC'] - 'NFC', 'NFD', 'NFKC' or 'NFKD'.
   * @returns {string} The normalized text.
   * @throws {RangeError} If `normalizeForm` is not a valid normalization form.
   */
  normalizeText(text, options = {}) {
    const {
      removeEmoji = false,
      removeAccents = false,
      lowercaseOnly = false,
      removeExtraSpaces = true,
      normalizeForm = 'NFC' // NFC, NFD, NFKC, NFKD
    } = options;

    // Unicode normalization
    let normalized = text.normalize(normalizeForm);

    if (removeEmoji) {
      // Keycap sequences, flag halves, and pictographs together with their
      // variation selectors, skin-tone modifiers and ZWJ-joined continuations.
      const emojiSequence =
        /[0-9#*]\uFE0F?\u20E3|\p{Regional_Indicator}|\p{Extended_Pictographic}[\uFE0F\p{Emoji_Modifier}]*(?:\u200D\p{Extended_Pictographic}[\uFE0F\p{Emoji_Modifier}]*)*/gu;
      normalized = normalized.replace(emojiSequence, '');
    }

    if (removeAccents) {
      // Decompose so accents become separate combining marks, drop them, then
      // go back to the requested form (plain NFD would leave e.g. Hangul decomposed).
      normalized = normalized
        .normalize('NFD')
        .replace(/[\u0300-\u036f]/g, '')
        .normalize(normalizeForm);
    }

    if (lowercaseOnly) {
      normalized = normalized.toLowerCase();
    }

    if (removeExtraSpaces) {
      normalized = normalized.replace(/\s+/g, ' ').trim();
    }

    return normalized;
  }

  /**
   * Split text into tokens (simplified).
   *
   * @param {string} text - Text to tokenize.
   * @param {string} [language='auto'] - 'chinese', 'english', 'japanese', or
   *   'auto' to detect the dominant language; anything else splits on whitespace.
   * @returns {string[]} The tokens.
   */
  tokenize(text, language = 'auto') {
    const resolved = language === 'auto' ? this.detectPrimaryLanguage(text) : language;
    switch (resolved) {
      case 'chinese':
        // Simplified; real Chinese segmentation needs a dedicated library.
        return this.chineseTokenize(text);
      case 'english':
        return this.englishTokenize(text);
      case 'japanese':
        return this.japaneseTokenize(text);
      default:
        return this.defaultTokenize(text);
    }
  }

  /**
   * Name the language whose script has the most characters in `text`.
   *
   * Script names are mapped to the language names `tokenize` understands
   * ('han' -> 'chinese', 'latin' -> 'english'); other scripts keep their
   * script name. Ties go to the script listed first by `analyzeLanguages`.
   *
   * @param {string} text - Text to inspect.
   * @returns {string} Language name; 'english' when no known script is found.
   */
  detectPrimaryLanguage(text) {
    const languageByScript = { han: 'chinese', latin: 'english' };
    let primary = 'english';
    let highest = 0;
    for (const [script, count] of Object.entries(this.analyzeLanguages(text))) {
      if (count > highest) {
        highest = count;
        primary = languageByScript[script] || script;
      }
    }
    return primary;
  }

  /**
   * Simplified Chinese tokenizer: splits on punctuation and whitespace.
   *
   * @param {string} text
   * @returns {string[]} Non-empty segments.
   */
  chineseTokenize(text) {
    return text
      .split(/[\p{Punctuation}\p{White_Space}]+/u)
      .filter((token) => token.length > 0);
  }

  /**
   * English tokenizer: lower-cases, then splits on anything that is not a
   * letter or a number.
   *
   * @param {string} text
   * @returns {string[]} Non-empty lower-case tokens.
   */
  englishTokenize(text) {
    return text
      .toLowerCase()
      .split(/[^\p{Letter}\p{Number}]+/u)
      .filter((token) => token.length > 0);
  }

  /**
   * Simplified Japanese tokenizer: consecutive kana form one token, every Han
   * character is its own token, and any other non-whitespace character is
   * emitted individually.
   *
   * @param {string} text
   * @returns {string[]} The tokens.
   */
  japaneseTokenize(text) {
    const tokens = [];
    let kanaRun = '';
    const flushKana = () => {
      if (kanaRun) {
        tokens.push(kanaRun);
        kanaRun = '';
      }
    };

    for (const char of Array.from(text)) {
      if (/[\p{Script=Hiragana}\p{Script=Katakana}]/u.test(char)) {
        kanaRun += char;
        continue;
      }
      flushKana();
      // Simplification: each Han character is a token of its own.
      if (/\p{Script=Han}/u.test(char) || !/\p{White_Space}/u.test(char)) {
        tokens.push(char);
      }
    }
    flushKana();

    return tokens.filter((token) => token.trim().length > 0);
  }

  /**
   * Fallback tokenizer: splits on whitespace.
   *
   * @param {string} text
   * @returns {string[]} Non-empty tokens.
   */
  defaultTokenize(text) {
    return text.split(/\s+/).filter((token) => token.length > 0);
  }
}
// Usage example
const processor = new UnicodeTextProcessor();
// Sample mixing Latin, Han, Arabic, Cyrillic, kana and Hangul text, followed by an emoji
const multilingualText = "Hello 你好 مرحبا Здравствуйте こんにちは 안녕하세요 🌍";
// Per-script character counts
console.log('语言分析:', processor.analyzeLanguages(multilingualText));
// Runs of text belonging to a single language
console.log('中文提取:', processor.extractByScript(multilingualText, 'chinese'));
console.log('英文提取:', processor.extractByScript(multilingualText, 'english'));
// Normalize with accent removal, emoji removal and lower-casing switched on
const normalizedText = processor.normalizeText("Café naïve résumé 🎉", {
removeAccents: true,
removeEmoji: true,
lowercaseOnly: true
});
console.log('规范化结果:', normalizedText);
// No language is passed, so tokenize() detects the primary language itself
const tokens = processor.tokenize("Hello world, 你好世界!");
console.log('分词结果:', tokens);
12.2 递归正则表达式
虽然JavaScript原生不支持递归正则表达式,但我们可以了解这个概念并用其他方式实现类似功能。
递归匹配的概念
// Recursive patterns exist in PCRE but are not supported by JavaScript, e.g.
//   balanced parentheses:  \((?:[^()]|(?R))*\)
//   nested HTML tags:      <(\w+)(?:[^<>]|(?R))*</\1>
// The class below provides JavaScript substitutes based on explicit depth
// counters and stacks.
class RecursivePatternMatcher {
  /**
   * Find every outermost balanced "(...)" group.
   *
   * @param {string} text - Text to scan.
   * @returns {Array<{match: string, start: number, end: number}>} Groups in
   *   order of appearance; `end` is exclusive. Unbalanced ')' are ignored.
   */
  matchBalancedParentheses(text) {
    const results = [];
    let depth = 0;
    let start = -1;

    for (let i = 0; i < text.length; i++) {
      const char = text[i];
      if (char === '(') {
        if (depth === 0) {
          start = i;
        }
        depth++;
      } else if (char === ')') {
        depth--;
        if (depth === 0 && start !== -1) {
          results.push({
            match: text.substring(start, i + 1),
            start: start,
            end: i + 1
          });
          start = -1;
        } else if (depth < 0) {
          depth = 0; // stray closer: reset so later groups still match
        }
      }
    }
    return results;
  }

  /**
   * Find outermost "{...}" spans that parse as valid JSON objects.
   *
   * Braces and brackets inside double-quoted strings are ignored, and a span
   * is only considered when it does not sit inside an open "[". Stray closing
   * braces/brackets are skipped so they cannot hide later objects.
   *
   * @param {string} text - Text that may embed JSON objects.
   * @returns {Array<{match: string, start: number, end: number}>} Valid
   *   objects in order of appearance; `end` is exclusive.
   */
  matchNestedJson(text) {
    const results = [];
    let braceDepth = 0;
    let bracketDepth = 0;
    let start = -1;
    let inString = false;
    let escaped = false;

    for (let i = 0; i < text.length; i++) {
      const char = text[i];

      if (inString) {
        if (escaped) {
          escaped = false; // the escaped character itself is never special
        } else if (char === '\\') {
          escaped = true;
        } else if (char === '"') {
          inString = false;
        }
        continue;
      }

      switch (char) {
        case '{':
          if (braceDepth === 0 && bracketDepth === 0) {
            start = i;
          }
          braceDepth++;
          break;
        case '}':
          if (braceDepth === 0) {
            break; // stray closer: never let the depth go negative
          }
          braceDepth--;
          if (braceDepth === 0 && bracketDepth === 0 && start !== -1) {
            const candidate = text.substring(start, i + 1);
            try {
              JSON.parse(candidate); // validation only
              results.push({ match: candidate, start: start, end: i + 1 });
            } catch (e) {
              // Balanced braces that are not valid JSON are skipped on purpose.
            }
            start = -1;
          }
          break;
        case '[':
          bracketDepth++;
          break;
        case ']':
          if (bracketDepth > 0) {
            bracketDepth--;
          }
          break;
        case '"':
          inString = true;
          break;
        default:
          break;
      }
    }
    return results;
  }

  /**
   * Pair opening and closing XML/HTML tags, including nested ones.
   *
   * Tags ending in "/>" are treated as self-closing. A closing tag is paired
   * with the nearest open tag of the same name; open tags above it on the
   * stack are discarded as unclosed.
   *
   * @param {string} text - Markup to scan.
   * @param {?string} [tagName=null] - When given, only this tag name
   *   (case-insensitive) is considered.
   * @returns {Array<{tag: string, fullMatch: string, content: string, start: number, end: number, depth: number}>}
   *   Elements in the order their closing tags appear. `depth` is the number
   *   of still-open tags below the element's opening tag (0 = outermost).
   */
  matchNestedTags(text, tagName = null) {
    const results = [];
    const stack = [];
    const tagPattern = /<(\/?)([\w-]+)(?:\s[^>]*)?>/g;
    const wanted = tagName ? tagName.toLowerCase() : null;

    for (const match of text.matchAll(tagPattern)) {
      const isClosing = match[1] === '/';
      const currentTag = match[2].toLowerCase();
      const fullMatch = match[0];
      const position = match.index;

      if (wanted && currentTag !== wanted) {
        continue;
      }

      if (!isClosing) {
        // Self-closing tags never need a partner.
        if (!fullMatch.endsWith('/>')) {
          stack.push({
            tag: currentTag,
            start: position,
            end: position + fullMatch.length
          });
        }
        continue;
      }

      // Search downwards for the matching opening tag.
      for (let i = stack.length - 1; i >= 0; i--) {
        if (stack[i].tag !== currentTag) {
          continue;
        }
        const startTag = stack[i];
        const end = position + fullMatch.length;
        results.push({
          tag: currentTag,
          fullMatch: text.substring(startTag.start, end),
          content: text.substring(startTag.end, position),
          start: startTag.start,
          end: end,
          // Use the opening tag's own stack index so unclosed tags above it
          // do not inflate the reported depth.
          depth: i
        });
        // Drop the matched tag and everything opened after it.
        stack.splice(i);
        break;
      }
    }
    return results;
  }

  /**
   * Generic balanced-delimiter matcher.
   *
   * @param {string} text - Text to scan.
   * @param {Object} [options]
   * @param {string} [options.open='('] - Opening delimiter (single character).
   * @param {string} [options.close=')'] - Closing delimiter (single character).
   * @param {string} [options.escapeChar='\\'] - The character following it is
   *   never treated as a delimiter.
   * @param {number} [options.minDepth=1] - Minimum nesting depth a group must
   *   reach to be reported (1 = any group).
   * @param {number} [options.maxDepth=Infinity] - A group nesting deeper than
   *   this is abandoned.
   * @returns {Array<{match: string, start: number, end: number, content: string}>}
   *   Outermost groups in order of appearance; `end` is exclusive.
   *
   * `options.allowNested` is accepted for compatibility but has no effect.
   */
  matchRecursivePattern(text, options = {}) {
    const {
      open = '(',
      close = ')',
      escapeChar = '\\',
      minDepth = 1,
      maxDepth = Infinity
    } = options;
    const results = [];
    let depth = 0;
    let deepest = 0; // deepest nesting level seen inside the current group
    let start = -1;
    let escaped = false;

    for (let i = 0; i < text.length; i++) {
      const char = text[i];
      if (escaped) {
        escaped = false;
        continue;
      }
      if (char === escapeChar) {
        escaped = true;
        continue;
      }

      if (char === open) {
        if (depth === 0) {
          start = i;
          deepest = 0;
        }
        depth++;
        if (depth > maxDepth) {
          // Too deep: abandon the current group entirely.
          depth = 0;
          start = -1;
        } else if (depth > deepest) {
          deepest = depth;
        }
      } else if (char === close && depth > 0) {
        depth--;
        if (depth === 0 && start !== -1) {
          if (deepest >= minDepth) {
            results.push({
              match: text.substring(start, i + 1),
              start: start,
              end: i + 1,
              content: text.substring(start + open.length, i)
            });
          }
          start = -1;
        }
      }
    }
    return results;
  }
}
// Usage example
const matcher = new RecursivePatternMatcher();
// Balanced-parentheses matching
const parenthesesText = "函数调用: func(a, func2(b, c), d) 和 另一个(x, y)";
console.log('括号匹配:', matcher.matchBalancedParentheses(parenthesesText));
// JSON object matching
const jsonText = '这里有JSON: {"name": "test", "nested": {"value": 123}} 和另一个 {"simple": true}';
console.log('JSON匹配:', matcher.matchNestedJson(jsonText));
// HTML tag matching
const htmlText = '<div>外层 <p>内层内容</p> 更多内容 <span>另一个内层</span></div>';
console.log('HTML匹配:', matcher.matchNestedTags(htmlText));
// Generic matcher, here configured for curly braces
const customText = 'code{block{nested content}more}end';
const customResults = matcher.matchRecursivePattern(customText, {
open: '{',
close: '}',
minDepth: 1
});
console.log('自定义匹配:', customResults);
12.3 正则表达式的替换和回调
高级替换技术
/**
 * Replacement helpers layered on String.prototype.replace: templates,
 * conditional rules, counters, context-aware callbacks and a set of
 * ready-made formatters.
 */
class AdvancedReplacer {
  constructor() {
    this.templates = new Map();
    this.transformers = new Map();
  }

  /**
   * Split the arguments a replace() callback receives after the match itself
   * (captures..., offset, input[, namedGroups]) into their parts.
   *
   * @param {Array} args - Everything after the first callback argument.
   * @returns {{groups: Array, offset: number, input: string}}
   */
  splitReplaceArgs(args) {
    const last = args[args.length - 1];
    // A trailing object is the named-groups bag; otherwise the input string is last.
    const tail = typeof last === 'object' && last !== null ? 3 : 2;
    const offsetIndex = args.length - tail;
    return {
      groups: args.slice(0, offsetIndex),
      offset: args[offsetIndex],
      input: args[offsetIndex + 1]
    };
  }

  /**
   * Escape a value for use in HTML text or a double-quoted attribute.
   *
   * @param {string} value
   * @returns {string}
   */
  escapeHtml(value) {
    const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    return String(value).replace(/[&<>"']/g, (char) => entities[char]);
  }

  /**
   * Replace matches with a template containing $0 (whole match) and $1, $2, ...
   * (capture groups). References to groups that do not exist or did not
   * participate become the empty string.
   *
   * @param {string} text
   * @param {RegExp} pattern
   * @param {string} template
   * @returns {string}
   */
  replaceWithTemplate(text, pattern, template) {
    return text.replace(pattern, (match, ...rest) => {
      const { groups } = this.splitReplaceArgs(rest);
      return template.replace(/\$(\d+)/g, (_, digits) => {
        const index = Number.parseInt(digits, 10);
        if (index === 0) return match;
        const value = groups[index - 1];
        return value === undefined ? '' : value;
      });
    });
  }

  /**
   * Apply rules in order; each rule replaces a match only when its condition holds.
   *
   * @param {string} text
   * @param {Array<{pattern: RegExp, condition: (boolean|function(string, Array): boolean), replacement: (string|function(string, Array): string), otherwise: (string|undefined)}>} rules
   *   `condition` and `replacement` callbacks receive (match, captureGroups).
   *   `otherwise` is used when the condition fails; when it is omitted the
   *   match is left unchanged.
   * @returns {string}
   */
  conditionalReplace(text, rules) {
    return rules.reduce((current, rule) => {
      const { pattern, condition, replacement, otherwise } = rule;
      return current.replace(pattern, (match, ...rest) => {
        const { groups } = this.splitReplaceArgs(rest);
        const shouldReplace = typeof condition === 'function'
          ? condition(match, groups)
          : condition;
        if (shouldReplace) {
          return typeof replacement === 'function'
            ? replacement(match, groups)
            : replacement;
        }
        // Only a missing `otherwise` falls back to the match; '' is a valid value.
        return otherwise == null ? match : otherwise;
      });
    }, text);
  }

  /**
   * Replace matches while passing a running zero-based index to the callback.
   *
   * @param {string} text
   * @param {RegExp} pattern
   * @param {function(string, Array, number): string} callback - Receives
   *   (match, captureGroups, index).
   * @returns {{result: string, count: number}} The new text and the number of
   *   replacements made.
   */
  replaceWithCounter(text, pattern, callback) {
    let count = 0;
    const result = text.replace(pattern, (match, ...rest) => {
      const { groups } = this.splitReplaceArgs(rest);
      return callback(match, groups, count++);
    });
    return { result, count };
  }

  /**
   * Replace matches while giving the callback the text surrounding each match.
   *
   * @param {string} text
   * @param {RegExp} pattern
   * @param {function(string, Array, {before: string, after: string, full: string, position: number}): string} callback
   *   Receives (match, captureGroups, context).
   * @param {number} [contextSize=20] - Characters of context on each side.
   * @returns {string}
   */
  contextAwareReplace(text, pattern, callback, contextSize = 20) {
    // The replacer's own trailing arguments carry the offset and input string;
    // `arguments` cannot be used here because arrow functions do not have their own.
    return text.replace(pattern, (match, ...rest) => {
      const { groups, offset, input } = this.splitReplaceArgs(rest);
      const matchEnd = offset + match.length;
      const contextStart = Math.max(0, offset - contextSize);
      const contextEnd = Math.min(input.length, matchEnd + contextSize);
      const context = {
        before: input.substring(contextStart, offset),
        after: input.substring(matchEnd, contextEnd),
        full: input.substring(contextStart, contextEnd),
        position: offset
      };
      return callback(match, groups, context);
    });
  }

  /**
   * Run several replacements one after another.
   *
   * @param {string} text
   * @param {Array<{pattern: RegExp, replacement: (string|Function|{template: string})}>} operations
   *   A string or function is handed to replace() directly; `{template}` goes
   *   through replaceWithTemplate. Operations without a usable replacement are skipped.
   * @returns {string}
   */
  chainReplace(text, operations) {
    return operations.reduce((result, operation) => {
      const { pattern, replacement } = operation;
      if (typeof replacement === 'function' || typeof replacement === 'string') {
        return result.replace(pattern, replacement);
      }
      if (replacement != null && replacement.template) {
        return this.replaceWithTemplate(result, pattern, replacement.template);
      }
      return result;
    }, text);
  }

  /**
   * Apply the built-in formatters selected by the keys of `formatters`.
   *
   * @param {string} text
   * @param {Object<string, {format: string}>} formatters - Keys 'phone',
   *   'date', 'currency', 'url', 'email'; unknown keys are ignored. Formatters
   *   run in the object's key order.
   * @returns {string}
   */
  smartFormat(text, formatters) {
    let result = text;
    Object.entries(formatters).forEach(([type, formatter]) => {
      switch (type) {
        case 'phone':
          result = this.formatPhoneNumbers(result, formatter);
          break;
        case 'date':
          result = this.formatDates(result, formatter);
          break;
        case 'currency':
          result = this.formatCurrency(result, formatter);
          break;
        case 'url':
          result = this.formatUrls(result, formatter);
          break;
        case 'email':
          result = this.formatEmails(result, formatter);
          break;
        default:
          break;
      }
    });
    return result;
  }

  /**
   * Reformat 10-digit phone numbers (optionally separated by '-', '.' or whitespace).
   *
   * @param {string} text
   * @param {{format: string}} formatter - 'dots', 'dashes', 'parentheses' or
   *   'international'; anything else leaves numbers untouched.
   * @returns {string}
   */
  formatPhoneNumbers(text, formatter) {
    const phonePattern = /\b(\d{3})[-.\s]?(\d{3})[-.\s]?(\d{4})\b/g;
    return text.replace(phonePattern, (match, area, exchange, number) => {
      switch (formatter.format) {
        case 'dots':
          return `${area}.${exchange}.${number}`;
        case 'dashes':
          return `${area}-${exchange}-${number}`;
        case 'parentheses':
          return `(${area}) ${exchange}-${number}`;
        case 'international':
          return `+1-${area}-${exchange}-${number}`;
        default:
          return match;
      }
    });
  }

  /**
   * Reformat dates written as YYYY-MM-DD or MM/DD/YYYY ('-' or '/' separators).
   *
   * Both layouts are handled in a single pass so that freshly written output
   * is never re-read as input in the other layout.
   *
   * @param {string} text
   * @param {{format: string}} formatter - 'ISO', 'US', 'EU' or 'readable';
   *   anything else leaves dates untouched.
   * @returns {string}
   */
  formatDates(text, formatter) {
    const datePattern =
      /\b(?:(\d{4})[-/](\d{1,2})[-/](\d{1,2})|(\d{1,2})[-/](\d{1,2})[-/](\d{4}))\b/g;
    const monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];

    return text.replace(datePattern, (match, isoYear, isoMonth, isoDay, usMonth, usDay, usYear) => {
      const [year, month, day] = isoYear !== undefined
        ? [isoYear, isoMonth, isoDay]
        : [usYear, usMonth, usDay];

      switch (formatter.format) {
        case 'ISO':
          return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
        case 'US':
          return `${month}/${day}/${year}`;
        case 'EU':
          return `${day}/${month}/${year}`;
        case 'readable': {
          const monthName = monthNames[Number.parseInt(month, 10) - 1];
          // Leave text with an impossible month alone instead of printing "undefined".
          return monthName === undefined ? match : `${monthName} ${day}, ${year}`;
        }
        default:
          return match;
      }
    });
  }

  /**
   * Reformat dollar amounts such as "$12" or "$12.50".
   *
   * @param {string} text
   * @param {{format: string}} formatter - 'commas' (thousands separators,
   *   cents preserved), 'words', or 'cents'; anything else leaves amounts untouched.
   * @returns {string}
   */
  formatCurrency(text, formatter) {
    const currencyPattern = /\$(\d+(?:\.\d{2})?)/g;
    return text.replace(currencyPattern, (match, amount) => {
      const num = Number.parseFloat(amount);
      switch (formatter.format) {
        case 'commas': {
          // Fixed locale and fraction digits keep the output deterministic
          // and stop "$1234.50" from losing its trailing zero.
          const fractionDigits = amount.includes('.') ? 2 : 0;
          const grouped = num.toLocaleString('en-US', {
            minimumFractionDigits: fractionDigits,
            maximumFractionDigits: fractionDigits
          });
          return `$${grouped}`;
        }
        case 'words':
          return this.numberToWords(num);
        case 'cents':
          return `${Math.round(num * 100)}¢`;
        default:
          return match;
      }
    });
  }

  /**
   * Reformat http(s) URLs. Trailing sentence punctuation (. , ; : ! ?) is not
   * treated as part of the URL.
   *
   * @param {string} text
   * @param {{format: string}} formatter - 'markdown', 'html' (HTML-escaped) or
   *   'short'; anything else leaves URLs untouched.
   * @returns {string}
   */
  formatUrls(text, formatter) {
    const urlPattern = /https?:\/\/[^\s]*[^\s.,;:!?]/g;
    return text.replace(urlPattern, (match) => {
      switch (formatter.format) {
        case 'markdown':
          return `[${match}](${match})`;
        case 'html': {
          // URLs may contain quotes or angle brackets; escape before embedding in markup.
          const safe = this.escapeHtml(match);
          return `<a href="${safe}" target="_blank">${safe}</a>`;
        }
        case 'short': {
          const domain = match.replace(/https?:\/\//, '').split('/')[0];
          return `${domain}...`;
        }
        default:
          return match;
      }
    });
  }

  /**
   * Reformat e-mail addresses.
   *
   * @param {string} text
   * @param {{format: string}} formatter - 'mailto', 'obfuscate', 'domain' or
   *   'username'; anything else leaves addresses untouched.
   * @returns {string}
   */
  formatEmails(text, formatter) {
    // The TLD class is [A-Za-z]; a '|' inside a character class would be a literal pipe.
    const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
    return text.replace(emailPattern, (match) => {
      switch (formatter.format) {
        case 'mailto':
          return `<a href="mailto:${match}">${match}</a>`;
        case 'obfuscate':
          return match.replace('@', ' [at] ').replace(/\./g, ' [dot] ');
        case 'domain':
          return match.split('@')[1];
        case 'username':
          return match.split('@')[0];
        default:
          return match;
      }
    });
  }

  /**
   * Spell out an integer from 0 to 999 in English (simplified).
   *
   * @param {number} num
   * @returns {string} The words, or the plain decimal string for values that
   *   are negative, fractional, or 1000 and above.
   */
  numberToWords(num) {
    const ones = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'];
    const teens = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'];
    const tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'];

    // The lookup tables only make sense for non-negative integers.
    if (!Number.isInteger(num) || num < 0) return String(num);
    if (num === 0) return 'zero';
    if (num < 10) return ones[num];
    if (num < 20) return teens[num - 10];
    if (num < 100) return tens[Math.floor(num / 10)] + (num % 10 ? ' ' + ones[num % 10] : '');
    if (num < 1000) return ones[Math.floor(num / 100)] + ' hundred' + (num % 100 ? ' ' + this.numberToWords(num % 100) : '');
    return num.toString(); // simplification: larger values are not spelled out
  }
}
// Usage examples
const replacer = new AdvancedReplacer();

// Template replacement
const templateText = "用户John的邮箱是john@example.com";
const templateResult = replacer.replaceWithTemplate(
  templateText,
  /用户(\w+)的邮箱是([^\s]+)/,
  "User: $1, Email: $2"
);
console.log('模板替换:', templateResult);

// Conditional replacement: only amounts above 20 are rewritten.
// `otherwise` is left out on purpose: conditionalReplace then keeps the
// original match. (Writing `otherwise: match` here would throw a
// ReferenceError, because no `match` variable exists in this scope.)
const conditionalText = "价格: $10, $25, $100, $5";
const conditionalResult = replacer.conditionalReplace(conditionalText, [
  {
    pattern: /\$(\d+)/g,
    condition: (match, groups) => Number.parseInt(groups[0], 10) > 20,
    replacement: (match, groups) => `💰${groups[0]}`
  }
]);
console.log('条件替换:', conditionalResult);

// Counter replacement: number each occurrence starting from 1
const counterText = "item item item item";
const counterResult = replacer.replaceWithCounter(
  counterText,
  /item/g,
  (match, groups, count) => `item-${count + 1}`
);
console.log('计数替换:', counterResult);

// Smart formatting: phone, date, currency and URL in one call
const formatText = "电话: 1234567890, 日期: 2023-12-01, 价格: $1234.56, 网站: https://example.com";
const formattedResult = replacer.smartFormat(formatText, {
  phone: { format: 'parentheses' },
  date: { format: 'readable' },
  currency: { format: 'commas' },
  url: { format: 'markdown' }
});
console.log('智能格式化:', formattedResult);
12.4 正则表达式的局限性
不能用正则表达式解决的问题
// Demonstrations of what regular expressions cannot do
/**
 * Each method illustrates one limitation of regular expressions and returns a
 * small report object describing it.
 *
 * Limitation 1 has no method of its own: native JavaScript regexes cannot
 * match nested structures such as balanced parentheses or nested HTML tags.
 */
class RegexLimitations {
  /**
   * Limitation 2: no arithmetic. A regex can check that text looks like an
   * equation, not that the equation holds.
   *
   * @returns {{formatValid: boolean, note: string}}
   */
  cannotDoMath() {
    const sample = '2 + 3 * 4 = 14';
    // Shape only: operand (operator operand)* = result
    const equationShape = /^\d+\s*[+\-*/]\s*\d+(?:\s*[+\-*/]\s*\d+)*\s*=\s*\d+$/;
    const formatValid = equationShape.test(sample);
    return { formatValid, note: '正则表达式无法验证数学计算的正确性' };
  }

  /**
   * Limitation 3: no context-sensitive analysis. Declarations and usages can
   * each be listed, but a regex cannot relate one to the other.
   *
   * @returns {{declarations: string[], usages: string[], note: string}}
   */
  cannotHandleContextSensitive() {
    // A tiny program that uses one declared and one undeclared variable.
    const code = [
      '',
      'let x = 10;',
      'console.log(x); // 使用已声明的变量',
      'console.log(y); // 使用未声明的变量',
      '',
    ].join('\n');

    const firstCaptures = (pattern) => Array.from(code.matchAll(pattern), (hit) => hit[1]);
    const declarations = firstCaptures(/let\s+(\w+)/g);
    const usages = firstCaptures(/console\.log\((\w+)\)/g);

    return {
      declarations,
      usages,
      note: '正则表达式无法检查变量的作用域和声明状态',
    };
  }

  /**
   * Limitation 4: no recursion. Every supported nesting depth needs its own
   * hand-written pattern.
   *
   * @returns {{level1Match: boolean, level2Match: boolean, note: string}}
   */
  cannotHandleRecursion() {
    // Three-level sample kept for illustration; neither pattern below is run against it.
    const nestedJson = '{"a": {"b": {"c": "value"}}}';

    const oneLevel = /^\{[^{}]*\}$/;
    const twoLevels = /^\{[^{}]*\{[^{}]*\}[^{}]*\}$/;
    const level1Match = oneLevel.test('{"simple": "value"}');
    const level2Match = twoLevels.test('{"outer": {"inner": "value"}}');

    return {
      level1Match,
      level2Match,
      note: '原生正则表达式无法处理任意深度的嵌套结构',
    };
  }

  /**
   * Limitation 5: catastrophic backtracking. Times a nested-quantifier
   * pattern against input that cannot match and logs the duration under the
   * console timer label 'catastrophic'.
   *
   * @returns {{note: string}}
   */
  demonstrateCatastrophicBacktracking() {
    const nestedQuantifier = /(a+)+b/;
    // Twenty 'a's followed by 'x': there is no 'b', so the match must fail.
    const subject = `${'a'.repeat(20)}x`;

    console.time('catastrophic');
    try {
      nestedQuantifier.test(subject);
    } catch (error) {
      console.log('模式匹配可能超时或失败');
    }
    console.timeEnd('catastrophic');

    return { note: '某些正则表达式模式可能导致指数级时间复杂度' };
  }

  /**
   * Alternatives to reach for when a regex is the wrong tool.
   *
   * @returns {Object<string, {problem: string, solution: string, examples: string[]}>}
   */
  getAlternativeSolutions() {
    const describe = (problem, solution, ...examples) => ({ problem, solution, examples });
    return {
      parsing: describe('复杂语法解析', '使用专门的解析器生成器', 'ANTLR', 'PEG.js', '近似算法'),
      validation: describe('复杂数据验证', '使用验证库和规则引擎', 'Joi', 'Yup', 'JSON Schema'),
      textProcessing: describe('高级文本处理', '使用专门的文本处理库', 'Natural Language Toolkit', 'spaCy', 'Stanford NLP'),
      codeAnalysis: describe('代码分析和重构', '使用抽象语法树(AST)工具', 'Babel', 'ESLint', 'TypeScript Compiler API'),
    };
  }
}
// Usage example: print the report returned by each limitation demo
const limitations = new RegexLimitations();
console.log('数学表达式限制:', limitations.cannotDoMath());
console.log('上下文相关限制:', limitations.cannotHandleContextSensitive());
console.log('递归结构限制:', limitations.cannotHandleRecursion());
console.log('替代解决方案:', limitations.getAlternativeSolutions());
// Called last; it logs its own timing through console.time/console.timeEnd
limitations.demonstrateCatastrophicBacktracking();
何时不使用正则表达式
/**
 * Side-by-side "bad" (regex-heavy) and "good" (purpose-built) implementations
 * for situations where a regular expression is the wrong tool. Each method
 * returns the pair of functions so they can be compared.
 */
class WhenNotToUseRegex {
  /**
   * Case 1: parsing complex structured data.
   *
   * @returns {{badJsonParser: Function, goodJsonParser: Function}}
   */
  parseComplexData() {
    // Bad: parsing JSON with regexes. Deliberately left unfinished (it returns
    // nothing) — completing it would be complicated and unreliable.
    const badJsonParser = (jsonStr) => {
      const stringPattern = /"([^"\\]|\\.)*"/g;
      const numberPattern = /-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/g;
    };

    // Good: the dedicated parser; invalid input yields null.
    const goodJsonParser = (jsonStr) => {
      let parsed = null;
      try {
        parsed = JSON.parse(jsonStr);
      } catch (error) {
        parsed = null;
      }
      return parsed;
    };

    return { badJsonParser, goodJsonParser };
  }

  /**
   * Case 2: validating complex business rules.
   *
   * @returns {{badPasswordValidator: Function, goodPasswordValidator: Function}}
   */
  validateBusinessLogic() {
    // Bad: every rule squeezed into one pattern; callers only get true/false.
    const badPasswordValidator = (password) =>
      /^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/.test(password);

    // Good: one small check per rule, each with its own message.
    const goodPasswordValidator = (password) => {
      const checks = [
        [(p) => p.length >= 8, '至少8位字符'],
        [(p) => /[a-z]/.test(p), '包含小写字母'],
        [(p) => /[A-Z]/.test(p), '包含大写字母'],
        [(p) => /\d/.test(p), '包含数字'],
        [(p) => /[@$!%*?&]/.test(p), '包含特殊字符'],
        // Fails when any character appears three or more times in a row.
        [(p) => !/(.)\1{2,}/.test(p), '不能有连续重复字符'],
      ];
      const errors = [];
      for (const [passes, message] of checks) {
        if (!passes(password)) {
          errors.push(message);
        }
      }
      return { valid: errors.length === 0, errors };
    };

    return { badPasswordValidator, goodPasswordValidator };
  }

  /**
   * Case 3: performance-sensitive processing of large text.
   *
   * @returns {{badLargeTextProcessor: Function, goodLargeTextProcessor: Function}}
   */
  processLargeText() {
    // Bad: a complex pattern that may perform poorly on large input.
    const badLargeTextProcessor = (text) => text.match(/((\w+)\s+){2,}(\w+)/g);

    // Good: plain string operations — every run of three consecutive non-empty words.
    const goodLargeTextProcessor = (text) => {
      const tokens = text.split(/\s+/);
      const phrases = [];
      for (let last = 2; last < tokens.length; last++) {
        const first = tokens[last - 2];
        const middle = tokens[last - 1];
        const final = tokens[last];
        if (first && middle && final) {
          phrases.push([first, middle, final].join(' '));
        }
      }
      return phrases;
    };

    return { badLargeTextProcessor, goodLargeTextProcessor };
  }

  /**
   * Case 4: situations that need detailed error messages.
   *
   * @returns {{badEmailValidator: Function, goodEmailValidator: Function}}
   */
  provideDetailedErrors() {
    // Bad: a regex can only answer "matches" or "does not match".
    const badEmailValidator = (email) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email);

    // Good: collect a specific message for every problem found.
    const goodEmailValidator = (email) => {
      if (!email) {
        return { valid: false, errors: ['邮箱地址不能为空'] };
      }

      const errors = [];
      const flagIf = (failed, message) => {
        if (failed) {
          errors.push(message);
        }
      };

      flagIf(email.length > 254, '邮箱地址过长');
      flagIf(!/\S+@\S+\.\S+/.test(email), '邮箱格式不正确');

      const pieces = email.split('@');
      if (pieces.length === 2) {
        const [local, domain] = pieces;
        flagIf(local.length === 0, '邮箱用户名部分不能为空');
        flagIf(local.length > 64, '邮箱用户名部分过长');
        flagIf(domain.length === 0, '邮箱域名部分不能为空');
        flagIf(!/^[a-zA-Z0-9.-]+$/.test(domain), '域名包含无效字符');
      } else {
        errors.push('邮箱必须包含且仅包含一个@符号');
      }

      return { valid: errors.length === 0, errors };
    };

    return { badEmailValidator, goodEmailValidator };
  }

  /**
   * Recommended alternatives, grouped by problem area.
   *
   * @returns {Object<string, string[]>}
   */
  getRecommendedAlternatives() {
    const structuredDataParsing = [
      'JSON.parse() for JSON',
      'DOMParser for HTML/XML',
      'CSV parsing libraries',
      'YAML/TOML parsers',
    ];
    const complexValidation = [
      'Joi validation library',
      'Yup schema validation',
      'Custom validation functions',
      'Rule-based validation engines',
    ];
    const codeAnalysis = [
      'AST parsers (Babel, TypeScript)',
      'ESLint rules',
      'Language servers',
      'Static analysis tools',
    ];
    const textProcessing = [
      'String methods for simple cases',
      'Specialized NLP libraries',
      'Tokenization libraries',
      'Search engines (Elasticsearch, Solr)',
    ];
    const performanceCritical = [
      'Native string methods',
      'Streaming processors',
      'Compiled regular expressions',
      'Finite state machines',
    ];
    return {
      structuredDataParsing,
      complexValidation,
      codeAnalysis,
      textProcessing,
      performanceCritical,
    };
  }
}
// Named `regexGuidance` rather than `examples`: the sample-data constant near
// the top of this chapter is already called `examples`, and declaring a second
// `const` with that name in the same scope is a SyntaxError.
const regexGuidance = new WhenNotToUseRegex();
console.log('正则表达式的替代方案:', regexGuidance.getRecommendedAlternatives());
小结
本章探讨了正则表达式的高级主题和扩展知识:
Unicode和多语言处理:
- Unicode属性匹配
- 多语言文本分析
- 文本规范化和分词
- 国际化文本处理
递归正则表达式:
- 递归匹配概念
- JavaScript中的替代实现
- 平衡结构匹配
- 嵌套内容处理
高级替换技术:
- 模板替换
- 条件替换
- 上下文感知替换
- 智能格式化
正则表达式的局限性:
- 无法处理的问题类型
- 性能陷阱
- 何时不使用正则表达式
- 替代解决方案
最佳实践建议:
- 选择合适的工具
- 平衡复杂性和可维护性
- 性能优化考虑
- 错误处理和用户体验
理解这些高级主题和局限性,可以帮助我们更明智地使用正则表达式,在合适的场景中发挥其最大价值,同时避免不必要的复杂性和性能问题。
课程总结
通过这12章的学习,我们全面掌握了正则表达式的:
- 基础语法和概念
- 字符匹配和模式构建
- 高级特性和技巧
- 实际应用和最佳实践
- 性能优化和替代方案
正则表达式是强大的文本处理工具,但要记住:工具的价值在于解决问题,而不是展示复杂性。选择最适合问题的工具,编写可读性强、性能良好的代码,这是专业开发者的标志。