第 12 章：扩展知识和进阶应用

Haiyue · 2025/9/1 · 大约 14 分钟

第 12 章：扩展知识和进阶应用

学习目标

了解Unicode和多语言文本处理
掌握递归正则表达式的概念
学习正则表达式的替换和回调
理解有限状态自动机原理
探索正则表达式的局限性和替代方案

12.1 Unicode和多语言文本处理

Unicode基础概念

// Sample strings showing different kinds of Unicode text
const examples = {
    // Basic Chinese characters
    chinese: '你好世界',
    
    // Characters carrying accents / diacritics
    accented: 'café naïve résumé',
    
    // Emoji
    emoji: '😀🎉🌟💖',
    
    // Composite character (family emoji)
    composite: '👨‍👩‍👧‍👦',
    
    // Several languages mixed in one string
    mixed: 'Hello 你好 Bonjour مرحبا'
};

/**
 * Describe every code point of a string.
 *
 * @param {string} text - Text to inspect.
 * @returns {Array<{char: string, codePoint: number, hexCode: string, category: string, length: number}>}
 *   One record per code point: the character, its numeric code point, the
 *   upper-case hex form padded to at least four digits, the label returned by
 *   getUnicodeCategory, and the character's length in UTF-16 code units.
 */
function analyzeUnicode(text) {
    const report = [];

    // Array.from splits by code point, so a surrogate pair stays one entry.
    for (const char of Array.from(text)) {
        const codePoint = char.codePointAt(0);
        const hexCode = codePoint.toString(16).toUpperCase().padStart(4, '0');

        report.push({
            char,
            codePoint,
            hexCode,
            category: getUnicodeCategory(char),
            length: char.length
        });
    }

    return report;
}

/**
 * Classify a character by the code-point range its first code point falls in.
 *
 * @param {string} char - The character to classify (only its first code point is read).
 * @returns {string} 'Digit', 'Uppercase Letter', 'Lowercase Letter',
 *   'CJK Ideograph', 'Emoticons', or 'Other' when no range applies.
 */
function getUnicodeCategory(char) {
    const codePoint = char.codePointAt(0);

    // [first, last, label] — inclusive ranges, checked in order.
    const ranges = [
        [0x0030, 0x0039, 'Digit'],
        [0x0041, 0x005A, 'Uppercase Letter'],
        [0x0061, 0x007A, 'Lowercase Letter'],
        [0x4E00, 0x9FFF, 'CJK Ideograph'],
        [0x1F600, 0x1F64F, 'Emoticons']
    ];

    const hit = ranges.find(([first, last]) => codePoint >= first && codePoint <= last);

    return hit ? hit[2] : 'Other';
}

// Demo: log the per-code-point analysis of a string mixing ASCII, Chinese and an emoji.
console.log('Unicode分析:', analyzeUnicode('Hello 你好 😀'));

Unicode属性匹配

// Unicode property escapes (\p{...}) require the `u` flag (ES2018+).
/**
 * Helpers for analysing, extracting, normalising and tokenising multilingual
 * text with Unicode-aware regular expressions.
 */
class UnicodeTextProcessor {
    constructor() {
        // Single-character matchers for common Unicode classes.
        this.patterns = {
            // Any letter
            letters: /\p{Letter}/u,

            // Any numeric character
            numbers: /\p{Number}/u,

            // Chinese (Han) characters
            chinese: /\p{Script=Han}/u,

            // Emoji. Extended_Pictographic is used instead of Emoji because
            // \p{Emoji} also matches the ASCII digits, '#' and '*'.
            emoji: /\p{Extended_Pictographic}/u,

            // Arabic script
            arabic: /\p{Script=Arabic}/u,

            // Punctuation
            punctuation: /\p{Punctuation}/u,

            // Currency symbols
            currency: /\p{Currency_Symbol}/u,

            // Math symbols
            math: /\p{Math_Symbol}/u,

            // White space
            whitespace: /\p{White_Space}/u,

            // Upper-case letters
            uppercase: /\p{Uppercase_Letter}/u,

            // Lower-case letters
            lowercase: /\p{Lowercase_Letter}/u
        };
    }

    /**
     * Count how many characters of each supported script occur in the text.
     *
     * @param {string} text - Text to analyse.
     * @returns {Object<string, number>} Character count per script key
     *   (latin, han, arabic, cyrillic, greek, hebrew, japanese, korean, thai,
     *   devanagari); scripts that do not occur are omitted. `japanese` counts
     *   kana only — Han characters are reported under `han`.
     */
    analyzeLanguages(text) {
        const scriptPatterns = {
            latin: /\p{Script=Latin}/gu,
            han: /\p{Script=Han}/gu,
            arabic: /\p{Script=Arabic}/gu,
            cyrillic: /\p{Script=Cyrillic}/gu,
            greek: /\p{Script=Greek}/gu,
            hebrew: /\p{Script=Hebrew}/gu,
            japanese: /[\p{Script=Hiragana}\p{Script=Katakana}]/gu,
            korean: /\p{Script=Hangul}/gu,
            thai: /\p{Script=Thai}/gu,
            devanagari: /\p{Script=Devanagari}/gu
        };

        const scripts = {};

        for (const [script, pattern] of Object.entries(scriptPatterns)) {
            const matches = text.match(pattern);
            if (matches) {
                scripts[script] = matches.length;
            }
        }

        return scripts;
    }

    /**
     * Extract the runs of text written in one language's script.
     *
     * @param {string} text - Text to search.
     * @param {string} script - One of chinese, english, arabic, russian,
     *   japanese, korean (case-insensitive).
     * @returns {string[]} Matching runs in order of appearance; an empty array
     *   for an unsupported key or when nothing matches.
     */
    extractByScript(text, script) {
        // For space-separated scripts a run is "word (spaces word)*", so a match
        // always starts and ends on a script character: no whitespace-only or
        // whitespace-padded results.
        const runPatterns = {
            chinese: /[\p{Script=Han}]+/gu,
            english: /\p{Script=Latin}+(?:\s+\p{Script=Latin}+)*/gu,
            arabic: /\p{Script=Arabic}+(?:\s+\p{Script=Arabic}+)*/gu,
            russian: /\p{Script=Cyrillic}+(?:\s+\p{Script=Cyrillic}+)*/gu,
            japanese: /[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}]+/gu,
            korean: /[\p{Script=Hangul}]+/gu
        };

        const key = script.toLowerCase();
        // Own-property check so keys such as "constructor" are not treated as patterns.
        if (!Object.prototype.hasOwnProperty.call(runPatterns, key)) return [];

        return text.match(runPatterns[key]) || [];
    }

    /**
     * Clean up and normalise text.
     *
     * @param {string} text - Input text.
     * @param {Object} [options]
     * @param {boolean} [options.removeEmoji=false] - Remove emoji, including
     *   ZWJ sequences, flags and keycaps.
     * @param {boolean} [options.removeAccents=false] - Remove combining
     *   diacritics (U+0300–U+036F).
     * @param {boolean} [options.lowercaseOnly=false] - Lower-case the result.
     * @param {boolean} [options.removeExtraSpaces=true] - Collapse whitespace
     *   runs to a single space and trim.
     * @param {string} [options.normalizeForm='NFC'] - NFC, NFD, NFKC or NFKD.
     * @returns {string} The normalised text.
     */
    normalizeText(text, options = {}) {
        const {
            removeEmoji = false,
            removeAccents = false,
            lowercaseOnly = false,
            removeExtraSpaces = true,
            normalizeForm = 'NFC' // NFC, NFD, NFKC, NFKD
        } = options;

        let normalized = text.normalize(normalizeForm);

        if (removeEmoji) {
            // Match whole emoji sequences so no joiners or variation selectors
            // are left behind: flags, keycaps, then pictographs optionally
            // chained with ZWJ, then stray skin-tone modifiers.
            const emojiSequence = /\p{Regional_Indicator}+|[0-9#*]\uFE0F?\u20E3|\p{Extended_Pictographic}(?:\uFE0F|\p{Emoji_Modifier})?(?:\u200D\p{Extended_Pictographic}(?:\uFE0F|\p{Emoji_Modifier})?)*|\p{Emoji_Modifier}/gu;
            normalized = normalized.replace(emojiSequence, '');
        }

        if (removeAccents) {
            // Decompose, drop the combining marks, then return to the requested
            // form; otherwise everything else (e.g. Hangul) stays decomposed.
            normalized = normalized
                .normalize('NFD')
                .replace(/[\u0300-\u036f]/g, '')
                .normalize(normalizeForm);
        }

        if (lowercaseOnly) {
            normalized = normalized.toLowerCase();
        }

        if (removeExtraSpaces) {
            normalized = normalized.replace(/\s+/g, ' ').trim();
        }

        return normalized;
    }

    /**
     * Tokenise text (simplified).
     *
     * @param {string} text - Text to tokenise.
     * @param {string} [language='auto'] - 'chinese', 'english', 'japanese', or
     *   'auto' to detect the dominant script; anything else splits on whitespace.
     * @returns {string[]} Tokens.
     */
    tokenize(text, language = 'auto') {
        if (language === 'auto') {
            language = this.detectPrimaryLanguage(text);
        }

        switch (language) {
            case 'chinese':
                // Simplified; real Chinese segmentation needs a dedicated library.
                return this.chineseTokenize(text);
            case 'english':
                return this.englishTokenize(text);
            case 'japanese':
                return this.japaneseTokenize(text);
            default:
                return this.defaultTokenize(text);
        }
    }

    /**
     * Pick the language whose script accounts for the most characters.
     *
     * @param {string} text - Text to analyse.
     * @returns {string} 'chinese' for Han, 'english' for Latin, otherwise the
     *   script key from analyzeLanguages; 'english' when no script is found.
     */
    detectPrimaryLanguage(text) {
        // Script keys whose tokenizer is registered under a language name.
        const languageByScript = { han: 'chinese', latin: 'english' };

        let primary = 'english';
        let highestCount = 0;

        for (const [script, count] of Object.entries(this.analyzeLanguages(text))) {
            if (count > highestCount) {
                highestCount = count;
                primary = languageByScript[script] || script;
            }
        }

        return primary;
    }

    /**
     * Simplified Chinese tokenisation: split on punctuation and white space.
     *
     * @param {string} text
     * @returns {string[]} Non-empty segments.
     */
    chineseTokenize(text) {
        return text.split(/[\p{Punctuation}\p{White_Space}]+/u)
                  .filter(token => token.length > 0);
    }

    /**
     * English tokenisation: lower-case, then split on anything that is not a
     * letter or a number.
     *
     * @param {string} text
     * @returns {string[]} Non-empty lower-case tokens.
     */
    englishTokenize(text) {
        return text.toLowerCase()
                  .split(/[^\p{Letter}\p{Number}]+/u)
                  .filter(token => token.length > 0);
    }

    /**
     * Simplified Japanese tokenisation: consecutive kana form one token, each
     * Han character is its own token, any other non-space character is emitted
     * individually.
     *
     * @param {string} text
     * @returns {string[]} Tokens.
     */
    japaneseTokenize(text) {
        const tokens = [];
        let kanaRun = '';

        const flushKana = () => {
            if (kanaRun) {
                tokens.push(kanaRun);
                kanaRun = '';
            }
        };

        for (const char of Array.from(text)) {
            if (/[\p{Script=Hiragana}\p{Script=Katakana}]/u.test(char)) {
                kanaRun += char;
                continue;
            }

            flushKana();

            // Han characters and other visible characters become single tokens;
            // white space only terminates the current kana run.
            if (/\p{Script=Han}/u.test(char) || !/\p{White_Space}/u.test(char)) {
                tokens.push(char);
            }
        }

        flushKana();

        return tokens.filter(token => token.trim().length > 0);
    }

    /**
     * Fallback tokenisation: split on white space.
     *
     * @param {string} text
     * @returns {string[]} Non-empty tokens.
     */
    defaultTokenize(text) {
        return text.split(/\s+/).filter(token => token.length > 0);
    }
}

// Usage example
const processor = new UnicodeTextProcessor();

const multilingualText = "Hello 你好 مرحبا Здравствуйте こんにちは 안녕하세요 🌍";

// Per-script character counts, then the Chinese and English runs of the same text.
console.log('语言分析:', processor.analyzeLanguages(multilingualText));
console.log('中文提取:', processor.extractByScript(multilingualText, 'chinese'));
console.log('英文提取:', processor.extractByScript(multilingualText, 'english'));

// Normalise with accent removal, emoji removal and lower-casing enabled.
const normalizedText = processor.normalizeText("Café naïve résumé 🎉", {
    removeAccents: true,
    removeEmoji: true,
    lowercaseOnly: true
});
console.log('规范化结果:', normalizedText);

// Tokenise with automatic language detection (no language argument given).
const tokens = processor.tokenize("Hello world, 你好世界！");
console.log('分词结果:', tokens);

12.2 递归正则表达式

虽然JavaScript原生不支持递归正则表达式，但我们可以了解这个概念并用其他方式实现类似功能。

递归匹配的概念

// Recursive regular expressions — concept only (PCRE syntax, not supported by JavaScript):
//   balanced parentheses:  \((?:[^()]|(?R))*\)
//   nested HTML tags:      <(\w+)[^>]*>(?:[^<]|(?R))*</\1>

// JavaScript alternative: scan the text with explicit depth counters / a stack.
class RecursivePatternMatcher {
    /**
     * Find every outermost balanced "(...)" group.
     *
     * @param {string} text - Text to scan.
     * @returns {Array<{match: string, start: number, end: number}>} Groups in
     *   order of appearance; `end` is exclusive.
     */
    matchBalancedParentheses(text) {
        const results = [];
        let depth = 0;
        let start = -1;

        for (let i = 0; i < text.length; i++) {
            const char = text[i];

            if (char === '(') {
                if (depth === 0) {
                    start = i;
                }
                depth++;
            } else if (char === ')') {
                depth--;
                if (depth === 0 && start !== -1) {
                    results.push({
                        match: text.substring(start, i + 1),
                        start: start,
                        end: i + 1
                    });
                    start = -1;
                } else if (depth < 0) {
                    depth = 0; // stray ')': reset so later groups still match
                }
            }
        }

        return results;
    }

    /**
     * Find top-level JSON objects embedded in arbitrary text.
     *
     * A candidate runs from a "{" seen at depth zero to its balancing "}",
     * ignoring braces inside double-quoted strings, and is kept only if
     * JSON.parse accepts it. Objects inside a top-level "[...]" are skipped.
     *
     * @param {string} text - Text to scan.
     * @returns {Array<{match: string, start: number, end: number}>} Valid JSON
     *   objects in order of appearance; `end` is exclusive.
     */
    matchNestedJson(text) {
        const results = [];
        let braceDepth = 0;
        let bracketDepth = 0;
        let start = -1;
        let inString = false;
        let escaped = false;

        for (let i = 0; i < text.length; i++) {
            const char = text[i];

            if (inString) {
                if (escaped) {
                    escaped = false; // character after a backslash is literal
                } else if (char === '\\') {
                    escaped = true;
                } else if (char === '"') {
                    inString = false;
                }
                continue;
            }

            if (char === '{') {
                if (braceDepth === 0 && bracketDepth === 0) {
                    start = i;
                }
                braceDepth++;
            } else if (char === '}') {
                // Never go below zero: a stray "}" in the surrounding prose
                // must not prevent later objects from being recognised.
                if (braceDepth > 0) {
                    braceDepth--;
                    if (braceDepth === 0 && bracketDepth === 0 && start !== -1) {
                        const jsonStr = text.substring(start, i + 1);
                        try {
                            JSON.parse(jsonStr); // validate the candidate
                            results.push({
                                match: jsonStr,
                                start: start,
                                end: i + 1
                            });
                        } catch (e) {
                            // Balanced braces but not valid JSON: skip this candidate.
                        }
                        start = -1;
                    }
                }
            } else if (char === '[') {
                bracketDepth++;
            } else if (char === ']') {
                // Same clamping as for braces.
                if (bracketDepth > 0) {
                    bracketDepth--;
                }
            } else if (char === '"') {
                inString = true;
            }
        }

        return results;
    }

    /**
     * Match paired XML/HTML tags, including nested ones.
     *
     * @param {string} text - Markup to scan.
     * @param {string|null} [tagName=null] - Restrict matching to this tag name
     *   (case-insensitive); null matches every tag.
     * @returns {Array<{tag: string, fullMatch: string, content: string, start: number, end: number, depth: number}>}
     *   Elements in the order their closing tags appear; `depth` is the number
     *   of still-open ancestors (0 for the outermost element).
     */
    matchNestedTags(text, tagName = null) {
        const results = [];
        const stack = [];
        const tagPattern = /<(\/?)([\w-]+)(?:\s[^>]*)?>/g;
        // HTML void elements never have a closing tag, so they must not be
        // left on the stack where they would inflate the depth of siblings.
        const voidElements = new Set([
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
            'input', 'link', 'meta', 'source', 'track', 'wbr'
        ]);
        const wantedTag = tagName ? tagName.toLowerCase() : null;
        let match;

        while ((match = tagPattern.exec(text)) !== null) {
            const isClosing = match[1] === '/';
            const currentTag = match[2].toLowerCase();
            const fullMatch = match[0];
            const position = match.index;

            if (wantedTag && currentTag !== wantedTag) {
                continue;
            }

            if (isClosing) {
                // Search downwards for the nearest matching opening tag.
                for (let i = stack.length - 1; i >= 0; i--) {
                    if (stack[i].tag === currentTag) {
                        const startTag = stack[i];
                        const end = position + fullMatch.length;

                        results.push({
                            tag: currentTag,
                            fullMatch: text.substring(startTag.start, end),
                            content: text.substring(startTag.end, position),
                            start: startTag.start,
                            end: end,
                            // Index in the stack = number of open ancestors,
                            // even when unclosed tags sit above this one.
                            depth: i
                        });

                        // Drop the matched tag and any unclosed tags above it.
                        stack.splice(i);
                        break;
                    }
                }
            } else if (!fullMatch.endsWith('/>') && !voidElements.has(currentTag)) {
                // Self-closing and void tags open nothing.
                stack.push({
                    tag: currentTag,
                    start: position,
                    end: position + fullMatch.length
                });
            }
        }

        return results;
    }

    /**
     * Generic matcher for outermost balanced open/close delimiter pairs.
     *
     * @param {string} text - Text to scan.
     * @param {Object} [options]
     * @param {string} [options.open='('] - Opening delimiter (may be longer than one character).
     * @param {string} [options.close=')'] - Closing delimiter (may be longer than one character).
     * @param {string} [options.escapeChar='\\'] - The character following it is
     *   skipped; pass '' to disable escaping.
     * @param {boolean} [options.allowNested=true] - When false, groups that
     *   contain nested groups are rejected.
     * @param {number} [options.minDepth=1] - Minimum nesting depth a group must reach.
     * @param {number} [options.maxDepth=Infinity] - Maximum nesting depth a group may reach.
     * @returns {Array<{match: string, start: number, end: number, content: string}>}
     *   Groups in order of appearance; `content` excludes the delimiters and
     *   `end` is exclusive.
     */
    matchRecursivePattern(text, options = {}) {
        const {
            open = '(',
            close = ')',
            escapeChar = '\\',
            allowNested = true,
            minDepth = 1,
            maxDepth = Infinity
        } = options;

        const results = [];
        if (!open || !close) return results;

        const depthLimit = allowNested ? maxDepth : Math.min(maxDepth, 1);
        let depth = 0;
        let deepest = 0; // deepest nesting reached inside the current group
        let start = -1;
        let i = 0;

        while (i < text.length) {
            if (escapeChar && text.startsWith(escapeChar, i)) {
                i += escapeChar.length + 1; // skip the escape and the escaped character
                continue;
            }

            if (text.startsWith(open, i)) {
                if (depth === 0) {
                    start = i;
                    deepest = 0;
                }
                depth++;
                deepest = Math.max(deepest, depth);
                i += open.length;
                continue;
            }

            if (depth > 0 && text.startsWith(close, i)) {
                depth--;
                if (depth === 0) {
                    // Judge the whole group by the depth it actually reached.
                    if (deepest >= minDepth && deepest <= depthLimit) {
                        results.push({
                            match: text.substring(start, i + close.length),
                            start: start,
                            end: i + close.length,
                            content: text.substring(start + open.length, i)
                        });
                    }
                    start = -1;
                }
                i += close.length;
                continue;
            }

            i++;
        }

        return results;
    }
}

// Usage example
const matcher = new RecursivePatternMatcher();

// Balanced-parentheses matching
const parenthesesText = "函数调用: func(a, func2(b, c), d) 和 另一个(x, y)";
console.log('括号匹配:', matcher.matchBalancedParentheses(parenthesesText));

// JSON matching
const jsonText = '这里有JSON: {"name": "test", "nested": {"value": 123}} 和另一个 {"simple": true}';
console.log('JSON匹配:', matcher.matchNestedJson(jsonText));

// HTML tag matching
const htmlText = '<div>外层 <p>内层内容</p> 更多内容 <span>另一个内层</span></div>';
console.log('HTML匹配:', matcher.matchNestedTags(htmlText));

// Generic recursive matching with custom delimiters
const customText = 'code{block{nested content}more}end';
const customResults = matcher.matchRecursivePattern(customText, {
    open: '{',
    close: '}',
    minDepth: 1
});
console.log('自定义匹配:', customResults);

12.3 正则表达式的替换和回调

高级替换技术

class AdvancedReplacer {
    constructor() {
        this.templates = new Map();
        this.transformers = new Map();
    }
    
    // 模板替换
    replaceWithTemplate(text, pattern, template) {
        return text.replace(pattern, (match, ...groups) => {
            let result = template;
            
            // 替换 $0, $1, $2 等占位符
            result = result.replace(/\$(\d+)/g, (_, index) => {
                const groupIndex = parseInt(index);
                if (groupIndex === 0) return match;
                return groups[groupIndex - 1] || '';
            });
            
            return result;
        });
    }
    
    // 条件替换
    conditionalReplace(text, rules) {
        let result = text;
        
        rules.forEach(rule => {
            const { pattern, condition, replacement, otherwise } = rule;
            
            result = result.replace(pattern, (match, ...groups) => {
                const shouldReplace = typeof condition === 'function' 
                    ? condition(match, groups)
                    : condition;
                
                if (shouldReplace) {
                    return typeof replacement === 'function'
                        ? replacement(match, groups)
                        : replacement;
                } else {
                    return otherwise || match;
                }
            });
        });
        
        return result;
    }
    
    // 计数替换
    replaceWithCounter(text, pattern, callback) {
        let count = 0;
        
        return {
            result: text.replace(pattern, (match, ...groups) => {
                return callback(match, groups, count++);
            }),
            count: count
        };
    }
    
    // 上下文感知替换
    contextAwareReplace(text, pattern, callback, contextSize = 20) {
        return text.replace(pattern, (match, ...groups) => {
            const matchStart = arguments[arguments.length - 2];
            const fullText = arguments[arguments.length - 1];
            
            const contextStart = Math.max(0, matchStart - contextSize);
            const contextEnd = Math.min(fullText.length, matchStart + match.length + contextSize);
            
            const context = {
                before: fullText.substring(contextStart, matchStart),
                after: fullText.substring(matchStart + match.length, contextEnd),
                full: fullText.substring(contextStart, contextEnd),
                position: matchStart
            };
            
            return callback(match, groups, context);
        });
    }
    
    // 链式替换
    chainReplace(text, operations) {
        return operations.reduce((result, operation) => {
            const { pattern, replacement, options = {} } = operation;
            
            if (typeof replacement === 'function') {
                return result.replace(pattern, replacement);
            } else if (typeof replacement === 'string') {
                return result.replace(pattern, replacement);
            } else if (replacement.template) {
                return this.replaceWithTemplate(result, pattern, replacement.template);
            }
            
            return result;
        }, text);
    }
    
    // 智能格式化
    smartFormat(text, formatters) {
        let result = text;
        
        Object.entries(formatters).forEach(([type, formatter]) => {
            switch (type) {
                case 'phone':
                    result = this.formatPhoneNumbers(result, formatter);
                    break;
                case 'date':
                    result = this.formatDates(result, formatter);
                    break;
                case 'currency':
                    result = this.formatCurrency(result, formatter);
                    break;
                case 'url':
                    result = this.formatUrls(result, formatter);
                    break;
                case 'email':
                    result = this.formatEmails(result, formatter);
                    break;
            }
        });
        
        return result;
    }
    
    formatPhoneNumbers(text, formatter) {
        const phonePattern = /\b(\d{3})[-.\s]?(\d{3})[-.\s]?(\d{4})\b/g;
        
        return text.replace(phonePattern, (match, area, exchange, number) => {
            switch (formatter.format) {
                case 'dots':
                    return `${area}.${exchange}.${number}`;
                case 'dashes':
                    return `${area}-${exchange}-${number}`;
                case 'parentheses':
                    return `(${area}) ${exchange}-${number}`;
                case 'international':
                    return `+1-${area}-${exchange}-${number}`;
                default:
                    return match;
            }
        });
    }
    
    formatDates(text, formatter) {
        const datePatterns = [
            /\b(\d{4})[-/](\d{1,2})[-/](\d{1,2})\b/g,  // YYYY-MM-DD
            /\b(\d{1,2})[-/](\d{1,2})[-/](\d{4})\b/g   // MM/DD/YYYY
        ];
        
        let result = text;
        
        datePatterns.forEach(pattern => {
            result = result.replace(pattern, (match, p1, p2, p3) => {
                let year, month, day;
                
                if (p1.length === 4) {
                    [year, month, day] = [p1, p2, p3];
                } else {
                    [month, day, year] = [p1, p2, p3];
                }
                
                switch (formatter.format) {
                    case 'ISO':
                        return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
                    case 'US':
                        return `${month}/${day}/${year}`;
                    case 'EU':
                        return `${day}/${month}/${year}`;
                    case 'readable':
                        const monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
                        return `${monthNames[parseInt(month) - 1]} ${day}, ${year}`;
                    default:
                        return match;
                }
            });
        });
        
        return result;
    }
    
    formatCurrency(text, formatter) {
        const currencyPattern = /\$(\d+(?:\.\d{2})?)/g;
        
        return text.replace(currencyPattern, (match, amount) => {
            const num = parseFloat(amount);
            
            switch (formatter.format) {
                case 'commas':
                    return `$${num.toLocaleString()}`;
                case 'words':
                    return this.numberToWords(num);
                case 'cents':
                    return `${Math.round(num * 100)}¢`;
                default:
                    return match;
            }
        });
    }
    
    formatUrls(text, formatter) {
        const urlPattern = /(https?:\/\/[^\s]+)/g;
        
        return text.replace(urlPattern, (match) => {
            switch (formatter.format) {
                case 'markdown':
                    return `[${match}](${match})`;
                case 'html':
                    return `<a href="${match}" target="_blank">${match}</a>`;
                case 'short':
                    const domain = match.replace(/https?:\/\//, '').split('/')[0];
                    return `${domain}...`;
                default:
                    return match;
            }
        });
    }
    
    formatEmails(text, formatter) {
        const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
        
        return text.replace(emailPattern, (match) => {
            switch (formatter.format) {
                case 'mailto':
                    return `<a href="mailto:${match}">${match}</a>`;
                case 'obfuscate':
                    return match.replace('@', ' [at] ').replace(/\./g, ' [dot] ');
                case 'domain':
                    return match.split('@')[1];
                case 'username':
                    return match.split('@')[0];
                default:
                    return match;
            }
        });
    }
    
    numberToWords(num) {
        // 简化的数字转文字实现
        const ones = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'];
        const teens = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'];
        const tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'];
        
        if (num < 10) return ones[num];
        if (num < 20) return teens[num - 10];
        if (num < 100) return tens[Math.floor(num / 10)] + (num % 10 ? ' ' + ones[num % 10] : '');
        if (num < 1000) return ones[Math.floor(num / 100)] + ' hundred' + (num % 100 ? ' ' + this.numberToWords(num % 100) : '');
        
        return num.toString(); // 简化处理
    }
}

// Usage example
const replacer = new AdvancedReplacer();

// Template replacement
const templateText = "用户John的邮箱是john@example.com";
const templateResult = replacer.replaceWithTemplate(
    templateText,
    /用户(\w+)的邮箱是([^\s]+)/,
    "User: $1, Email: $2"
);
console.log('模板替换:', templateResult);

// Conditional replacement.
// No `otherwise` is given: conditionalReplace keeps the original match when the
// condition fails. (`otherwise: match` would throw a ReferenceError here,
// because `match` only exists inside the replace callback.)
const conditionalText = "价格: $10, $25, $100, $5";
const conditionalResult = replacer.conditionalReplace(conditionalText, [
    {
        pattern: /\$(\d+)/g,
        condition: (match, groups) => Number.parseInt(groups[0], 10) > 20,
        replacement: (match, groups) => `💰${groups[0]}`
    }
]);
console.log('条件替换:', conditionalResult);

// Counter replacement
const counterText = "item item item item";
const counterResult = replacer.replaceWithCounter(
    counterText,
    /item/g,
    (match, groups, count) => `item-${count + 1}`
);
console.log('计数替换:', counterResult);

// Smart formatting
const formatText = "电话: 1234567890, 日期: 2023-12-01, 价格: $1234.56, 网站: https://example.com";
const formattedResult = replacer.smartFormat(formatText, {
    phone: { format: 'parentheses' },
    date: { format: 'readable' },
    currency: { format: 'commas' },
    url: { format: 'markdown' }
});
console.log('智能格式化:', formattedResult);

12.4 正则表达式的局限性

不能用正则表达式解决的问题

// Demonstrations of what regular expressions cannot do
class RegexLimitations {
    // 1. Nested structures (balanced parentheses, nested HTML tags) cannot be
    //    matched with native JavaScript regular expressions.

    /**
     * 2. No arithmetic: a regex can check the shape of an equation, not its result.
     *
     * @returns {{formatValid: boolean, note: string}}
     */
    cannotDoMath() {
        const equation = "2 + 3 * 4 = 14";

        // Checks only "number (operator number)+ = number".
        const shapeOnly = /^\d+\s*[+\-*/]\s*\d+(?:\s*[+\-*/]\s*\d+)*\s*=\s*\d+$/;
        const formatValid = shapeOnly.test(equation);

        return {
            formatValid,
            note: "正则表达式无法验证数学计算的正确性"
        };
    }

    /**
     * 3. No context-sensitive analysis: declarations and usages can be listed,
     * but not related to each other.
     *
     * @returns {{declarations: string[], usages: string[], note: string}}
     */
    cannotHandleContextSensitive() {
        // Sample program: one declared variable, one undeclared usage.
        const code = `
            let x = 10;
            console.log(x); // 使用已声明的变量
            console.log(y); // 使用未声明的变量
        `;

        const firstGroups = (regex) => Array.from(code.matchAll(regex), (found) => found[1]);

        const declarations = firstGroups(/let\s+(\w+)/g);
        const usages = firstGroups(/console\.log\((\w+)\)/g);

        return {
            declarations,
            usages,
            note: "正则表达式无法检查变量的作用域和声明状态"
        };
    }

    /**
     * 4. No native recursion: each nesting level needs its own hand-written pattern.
     *
     * @returns {{level1Match: boolean, level2Match: boolean, note: string}}
     */
    cannotHandleRecursion() {
        const nestedJson = '{"a": {"b": {"c": "value"}}}';

        // One pattern per fixed nesting level.
        const flatObject = /^\{[^{}]*\}$/;
        const oneLevelNested = /^\{[^{}]*\{[^{}]*\}[^{}]*\}$/;

        const level1Match = flatObject.test('{"simple": "value"}');
        const level2Match = oneLevelNested.test('{"outer": {"inner": "value"}}');

        return {
            level1Match,
            level2Match,
            note: "原生正则表达式无法处理任意深度的嵌套结构"
        };
    }

    /**
     * 5. Performance: time a nested-quantifier pattern against a non-matching
     * input (logged through console.time / console.timeEnd).
     *
     * @returns {{note: string}}
     */
    demonstrateCatastrophicBacktracking() {
        const nestedQuantifier = /(a+)+b/;
        const input = "a".repeat(20) + "x"; // does not end in 'b'

        console.time('catastrophic');
        try {
            nestedQuantifier.test(input);
        } catch (error) {
            console.log('模式匹配可能超时或失败');
        } finally {
            console.timeEnd('catastrophic');
        }

        return {
            note: "某些正则表达式模式可能导致指数级时间复杂度"
        };
    }

    /**
     * Alternative tools for each class of problem.
     *
     * @returns {Object<string, {problem: string, solution: string, examples: string[]}>}
     */
    getAlternativeSolutions() {
        const entry = (problem, solution, examples) => ({ problem, solution, examples });

        return {
            parsing: entry(
                "复杂语法解析",
                "使用专门的解析器生成器",
                ["ANTLR", "PEG.js", "近似算法"]
            ),

            validation: entry(
                "复杂数据验证",
                "使用验证库和规则引擎",
                ["Joi", "Yup", "JSON Schema"]
            ),

            textProcessing: entry(
                "高级文本处理",
                "使用专门的文本处理库",
                ["Natural Language Toolkit", "spaCy", "Stanford NLP"]
            ),

            codeAnalysis: entry(
                "代码分析和重构",
                "使用抽象语法树(AST)工具",
                ["Babel", "ESLint", "TypeScript Compiler API"]
            )
        };
    }
}

// Usage example
const limitations = new RegexLimitations();

// Log what each demonstration method returns.
console.log('数学表达式限制:', limitations.cannotDoMath());
console.log('上下文相关限制:', limitations.cannotHandleContextSensitive());
console.log('递归结构限制:', limitations.cannotHandleRecursion());
console.log('替代解决方案:', limitations.getAlternativeSolutions());

// Run last: this one times a regex with nested quantifiers via console.time.
limitations.demonstrateCatastrophicBacktracking();

何时不使用正则表达式

/**
 * Side-by-side "bad vs good" examples of tasks where a regular expression is
 * the wrong tool. Each method returns the pair of functions it contrasts.
 */
class WhenNotToUseRegex {
    /**
     * 1. Parsing complex structured data.
     *
     * @returns {{badJsonParser: Function, goodJsonParser: Function}}
     */
    parseComplexData() {
        // Bad: parsing JSON with regexes (intentionally left unfinished).
        const badJsonParser = (jsonStr) => {
            const stringPattern = /"([^"\\]|\\.)*"/g;
            const numberPattern = /-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/g;
            // ... this quickly becomes very complex and unreliable
        };

        // Good: use the dedicated parser; null signals invalid input.
        const goodJsonParser = (jsonStr) => {
            let parsed = null;
            try {
                parsed = JSON.parse(jsonStr);
            } catch (error) {
                parsed = null;
            }
            return parsed;
        };

        return { badJsonParser, goodJsonParser };
    }

    /**
     * 2. Validating complex business rules.
     *
     * @returns {{badPasswordValidator: Function, goodPasswordValidator: Function}}
     */
    validateBusinessLogic() {
        // Bad: one dense regex trying to enforce every rule at once.
        const badPasswordValidator = (password) =>
            /^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/.test(password);

        // Good: check rule by rule and report every rule that failed.
        const goodPasswordValidator = (password) => {
            // [message reported on failure, predicate that must hold]
            const checks = [
                ["至少8位字符", (p) => p.length >= 8],
                ["包含小写字母", (p) => /[a-z]/.test(p)],
                ["包含大写字母", (p) => /[A-Z]/.test(p)],
                ["包含数字", (p) => /\d/.test(p)],
                ["包含特殊字符", (p) => /[@$!%*?&]/.test(p)],
                ["不能有连续重复字符", (p) => !/(.)\1{2,}/.test(p)]
            ];

            const errors = [];
            for (const [message, passes] of checks) {
                if (!passes(password)) {
                    errors.push(message);
                }
            }

            return {
                valid: errors.length === 0,
                errors
            };
        };

        return { badPasswordValidator, goodPasswordValidator };
    }

    /**
     * 3. Performance-sensitive processing of large text.
     *
     * @returns {{badLargeTextProcessor: Function, goodLargeTextProcessor: Function}}
     */
    processLargeText() {
        // Bad: a complex regex over large text can cause performance problems.
        const badLargeTextProcessor = (text) => text.match(/((\w+)\s+){2,}(\w+)/g);

        // Good: split once, then build every run of three consecutive words.
        const goodLargeTextProcessor = (text) => {
            const words = text.split(/\s+/);
            const phrases = [];

            words.forEach((first, index) => {
                if (index >= words.length - 2) return;

                const second = words[index + 1];
                const third = words[index + 2];

                // Skip windows containing an empty token (leading/trailing white space).
                if (first && second && third) {
                    phrases.push(`${first} ${second} ${third}`);
                }
            });

            return phrases;
        };

        return { badLargeTextProcessor, goodLargeTextProcessor };
    }

    /**
     * 4. Situations that need detailed error messages.
     *
     * @returns {{badEmailValidator: Function, goodEmailValidator: Function}}
     */
    provideDetailedErrors() {
        // Bad: a regex can only answer "matches" or "does not match".
        const badEmailValidator = (email) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email);

        // Good: collect a specific message for each problem found.
        const goodEmailValidator = (email) => {
            const errors = [];
            const report = () => ({ valid: errors.length === 0, errors });

            if (!email) {
                errors.push("邮箱地址不能为空");
                return report();
            }

            if (email.length > 254) {
                errors.push("邮箱地址过长");
            }

            const looksLikeEmail = /\S+@\S+\.\S+/.test(email);
            if (!looksLikeEmail) {
                errors.push("邮箱格式不正确");
            }

            const parts = email.split('@');
            if (parts.length !== 2) {
                errors.push("邮箱必须包含且仅包含一个@符号");
                return report();
            }

            const [local, domain] = parts;

            if (local.length === 0) {
                errors.push("邮箱用户名部分不能为空");
            }

            if (local.length > 64) {
                errors.push("邮箱用户名部分过长");
            }

            if (domain.length === 0) {
                errors.push("邮箱域名部分不能为空");
            }

            if (!/^[a-zA-Z0-9.-]+$/.test(domain)) {
                errors.push("域名包含无效字符");
            }

            return report();
        };

        return { badEmailValidator, goodEmailValidator };
    }

    /**
     * Recommended alternatives, grouped by problem area.
     *
     * @returns {Object<string, string[]>}
     */
    getRecommendedAlternatives() {
        const structuredDataParsing = [
            "JSON.parse() for JSON",
            "DOMParser for HTML/XML",
            "CSV parsing libraries",
            "YAML/TOML parsers"
        ];

        const complexValidation = [
            "Joi validation library",
            "Yup schema validation",
            "Custom validation functions",
            "Rule-based validation engines"
        ];

        const codeAnalysis = [
            "AST parsers (Babel, TypeScript)",
            "ESLint rules",
            "Language servers",
            "Static analysis tools"
        ];

        const textProcessing = [
            "String methods for simple cases",
            "Specialized NLP libraries",
            "Tokenization libraries",
            "Search engines (Elasticsearch, Solr)"
        ];

        const performanceCritical = [
            "Native string methods",
            "Streaming processors",
            "Compiled regular expressions",
            "Finite state machines"
        ];

        return {
            structuredDataParsing,
            complexValidation,
            codeAnalysis,
            textProcessing,
            performanceCritical
        };
    }
}

// Usage example. Named `whenNotToUse` rather than `examples`: `examples` is
// already declared as the sample-strings object in the Unicode section, and a
// second `const examples` is a SyntaxError when the snippets run as one script.
const whenNotToUse = new WhenNotToUseRegex();
console.log('正则表达式的替代方案:', whenNotToUse.getRecommendedAlternatives());

小结

本章探讨了正则表达式的高级主题和扩展知识：

Unicode和多语言处理：
- Unicode属性匹配
- 多语言文本分析
- 文本规范化和分词
- 国际化文本处理
递归正则表达式：
- 递归匹配概念
- JavaScript中的替代实现
- 平衡结构匹配
- 嵌套内容处理
高级替换技术：
- 模板替换
- 条件替换
- 上下文感知替换
- 智能格式化
正则表达式的局限性：
- 无法处理的问题类型
- 性能陷阱
- 何时不使用正则表达式
- 替代解决方案
最佳实践建议：
- 选择合适的工具
- 平衡复杂性和可维护性
- 性能优化考虑
- 错误处理和用户体验

理解这些高级主题和局限性，可以帮助我们更明智地使用正则表达式，在合适的场景中发挥其最大价值，同时避免不必要的复杂性和性能问题。

课程总结

通过这12章的学习，我们全面掌握了正则表达式的：

基础语法和概念
字符匹配和模式构建
高级特性和技巧
实际应用和最佳实践
性能优化和替代方案

正则表达式是强大的文本处理工具，但要记住：工具的价值在于解决问题，而不是展示复杂性。选择最适合问题的工具，编写可读性强、性能良好的代码，这是专业开发者的标志。