第 11 章：文本处理综合实战

Haiyue · 2025/9/1 · 大约 11 分钟

第 11 章：文本处理综合实战

学习目标

实现日志文件分析工具
创建代码格式检查器
开发文本内容提取工具
设计数据清洗和格式转换程序
构建简单的模板引擎

11.1 日志文件分析工具

日志解析器

/**
 * Parses log text in several common formats (Apache access log, standard
 * application log, syslog, one-JSON-object-per-line) and derives simple
 * statistics, filters and a summary report from the parsed entries.
 */
class LogAnalyzer {
    constructor() {
        // Supported log formats. detectFormat() tries them in insertion order.
        this.patterns = {
            // Apache access log
            apache: /^(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+|-) "([^"]*)" "([^"]*)"/,

            // Standard application log: 2023-12-01 10:30:45 [INFO] Message
            standard: /^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)$/,

            // Syslog. The day of month is space-padded ("Dec  1"), so accept
            // one or two digits after one or more spaces; process names may
            // contain characters such as "-" or ".".
            syslog: /^(\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}) (\S+) ([^\s\[:]+)(?:\[(\d+)\])?: (.+)$/,

            // JSON: a single object per line
            json: /^\{.*\}$/
        };

        this.stats = {
            total: 0,
            byLevel: {},
            byHour: {},
            errors: []
        };
    }

    /**
     * Returns the name of the first pattern that matches the line.
     * @param {string} logLine - A single log line without line terminator.
     * @returns {string} 'apache' | 'standard' | 'syslog' | 'json' | 'unknown'.
     */
    detectFormat(logLine) {
        for (const [format, pattern] of Object.entries(this.patterns)) {
            if (pattern.test(logLine)) {
                return format;
            }
        }
        return 'unknown';
    }

    /**
     * Parses one line into an entry object.
     * @param {string} logLine - The line to parse.
     * @param {?string} [format=null] - Format name; auto-detected when omitted.
     * @returns {?object} The parsed entry, `{ format: 'unknown', raw }` for an
     *   unrecognized format, or null when the line does not parse as `format`.
     */
    parseLine(logLine, format = null) {
        if (!format) {
            format = this.detectFormat(logLine);
        }

        switch (format) {
            case 'apache':
                return this.parseApacheLog(logLine);
            case 'standard':
                return this.parseStandardLog(logLine);
            case 'syslog':
                return this.parseSyslog(logLine);
            case 'json':
                return this.parseJsonLog(logLine);
            default:
                return { format: 'unknown', raw: logLine };
        }
    }

    /** Parses an Apache access log line; returns null when it does not match. */
    parseApacheLog(logLine) {
        const match = this.patterns.apache.exec(logLine);
        if (!match) return null;

        return {
            format: 'apache',
            ip: match[1],
            timestamp: match[2],
            method: match[3],
            url: match[4],
            protocol: match[5],
            status: Number.parseInt(match[6], 10),
            // "-" means no response body was sent
            size: match[7] === '-' ? 0 : Number.parseInt(match[7], 10),
            referer: match[8],
            userAgent: match[9]
        };
    }

    /** Parses a "YYYY-MM-DD HH:MM:SS [LEVEL] message" line; null on mismatch. */
    parseStandardLog(logLine) {
        const match = this.patterns.standard.exec(logLine);
        if (!match) return null;

        return {
            format: 'standard',
            timestamp: match[1],
            level: match[2],
            message: match[3]
        };
    }

    /** Parses a syslog line; `pid` is null when the line has no "[pid]" part. */
    parseSyslog(logLine) {
        const match = this.patterns.syslog.exec(logLine);
        if (!match) return null;

        return {
            format: 'syslog',
            timestamp: match[1],
            host: match[2],
            process: match[3],
            pid: match[4] || null,
            message: match[5]
        };
    }

    /**
     * Parses a JSON log line. The object's own fields are spread after
     * `format`, so a `format` key inside the data overrides 'json'.
     * @returns {?object} The entry, or null when the line is not valid JSON
     *   (analyzeFile records such lines in `stats.errors`).
     */
    parseJsonLog(logLine) {
        try {
            const data = JSON.parse(logLine);
            return {
                format: 'json',
                ...data
            };
        } catch (e) {
            return null;
        }
    }

    /**
     * Parses every non-blank line of `logContent` and rebuilds `this.stats`.
     * @param {string} logContent - Log text with LF or CRLF line endings.
     * @returns {{entries: object[], stats: object}} Parsed entries and stats.
     */
    analyzeFile(logContent) {
        const results = [];

        this.stats = { total: 0, byLevel: {}, byHour: {}, errors: [] };

        // Split on LF or CRLF: a trailing "\r" would defeat the "$"-anchored
        // patterns. Blank lines are skipped inside the loop (not filtered
        // beforehand) so reported line numbers refer to the original input.
        logContent.split(/\r?\n/).forEach((line, index) => {
            if (!line.trim()) return;

            try {
                const parsed = this.parseLine(line);
                if (parsed) {
                    results.push(parsed);
                    this.updateStats(parsed, index);
                } else {
                    // Do not drop unparseable lines silently
                    this.stats.errors.push({ line: index + 1, error: 'Unable to parse log line' });
                }
            } catch (error) {
                this.stats.errors.push({ line: index + 1, error: error.message });
            }
        });

        this.stats.total = results.length;
        return {
            entries: results,
            stats: this.stats
        };
    }

    /**
     * Updates the per-level and per-hour counters for one entry.
     * @param {object} entry - A parsed entry.
     * @param {number} lineIndex - Zero-based line index (currently unused).
     */
    updateStats(entry, lineIndex) {
        // Count by log level
        if (entry.level) {
            this.stats.byLevel[entry.level] = (this.stats.byLevel[entry.level] || 0) + 1;
        }

        // Count by hour. The position of the time differs between formats, so
        // locate the first HH:MM:SS group that is not preceded by a digit
        // (this skips the "23:10:32" inside an Apache "2023:10:32:00").
        if (typeof entry.timestamp === 'string') {
            const timeMatch = /(?:^|\D)(\d{2}):\d{2}:\d{2}/.exec(entry.timestamp);
            if (timeMatch) {
                const hour = timeMatch[1];
                this.stats.byHour[hour] = (this.stats.byHour[hour] || 0) + 1;
            }
        }
    }

    /**
     * Converts an entry timestamp to a Date.
     * @param {*} timestamp - Timestamp as stored on an entry.
     * @returns {?Date} A valid Date, or null when the value cannot be resolved
     *   to an absolute time (missing, unparseable, or a syslog stamp, which
     *   carries no year).
     */
    toDate(timestamp) {
        if (timestamp === null || timestamp === undefined) return null;

        if (typeof timestamp === 'string') {
            // Apache: 01/Dec/2023:10:32:00 +0000 -> ISO 8601 with offset
            const apache = /^(\d{2})\/(\w{3})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) ([+\-])(\d{2})(\d{2})$/.exec(timestamp);
            if (apache) {
                const monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
                const monthIndex = monthNames.indexOf(apache[2]);
                if (monthIndex === -1) return null;
                const month = String(monthIndex + 1).padStart(2, '0');
                const iso = `${apache[3]}-${month}-${apache[1]}T${apache[4]}:${apache[5]}:${apache[6]}${apache[7]}${apache[8]}:${apache[9]}`;
                const apacheDate = new Date(iso);
                return Number.isNaN(apacheDate.getTime()) ? null : apacheDate;
            }

            // Syslog stamps have no year, so they cannot be placed in time
            if (/^\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}$/.test(timestamp)) return null;
        }

        const parsed = new Date(timestamp);
        return Number.isNaN(parsed.getTime()) ? null : parsed;
    }

    /**
     * Returns the entries that satisfy every given criterion.
     * @param {object[]} entries - Parsed entries.
     * @param {object} criteria - Optional `level` (exact match), `timeRange`
     *   (`{start, end}`, inclusive) and `pattern` (case-insensitive regex
     *   source tested against the message or raw line).
     * @returns {object[]} The matching entries. Entries whose timestamp cannot
     *   be converted to a Date are not excluded by `timeRange`.
     */
    filter(entries, criteria) {
        // Build the regex once instead of once per entry
        const regex = criteria.pattern ? new RegExp(criteria.pattern, 'i') : null;

        return entries.filter(entry => {
            if (criteria.level && entry.level !== criteria.level) {
                return false;
            }

            if (criteria.timeRange) {
                const entryTime = this.toDate(entry.timestamp);
                if (entryTime && (entryTime < criteria.timeRange.start || entryTime > criteria.timeRange.end)) {
                    return false;
                }
            }

            if (regex && !regex.test(entry.message || entry.raw || '')) {
                return false;
            }

            return true;
        });
    }

    /**
     * Builds a summary report from the result of analyzeFile().
     * @param {{entries: object[], stats: object}} analysis
     * @returns {object} Summary, distributions, top errors and recommendations.
     */
    generateReport(analysis) {
        const { entries, stats } = analysis;

        return {
            summary: {
                totalEntries: stats.total,
                errorLines: stats.errors.length,
                timeSpan: this.getTimeSpan(entries),
                formats: this.getFormatDistribution(entries)
            },
            levelDistribution: stats.byLevel,
            hourlyDistribution: stats.byHour,
            topErrors: this.getTopErrors(entries),
            recommendations: this.getRecommendations(stats, entries)
        };
    }

    /**
     * Returns the earliest and latest timestamp among the entries.
     * @returns {?{start: Date, end: Date, duration: number}} The span with
     *   duration in milliseconds, or null when no timestamp is usable.
     */
    getTimeSpan(entries) {
        const timestamps = entries
            .map(e => this.toDate(e.timestamp))
            .filter(d => d !== null)
            // Numeric comparator: the default sort compares string forms
            .sort((a, b) => a - b);

        if (timestamps.length === 0) return null;

        const start = timestamps[0];
        const end = timestamps[timestamps.length - 1];
        return {
            start,
            end,
            duration: end - start
        };
    }

    /** Counts entries per `format` value. */
    getFormatDistribution(entries) {
        const distribution = {};
        entries.forEach(entry => {
            distribution[entry.format] = (distribution[entry.format] || 0) + 1;
        });
        return distribution;
    }

    /**
     * Returns the most frequent error messages.
     * An entry counts as an error when its level is 'ERROR' or its HTTP
     * status is 400 or above.
     * @param {object[]} entries
     * @param {number} [limit=10] - Maximum number of rows returned.
     * @returns {{error: string, count: number}[]} Sorted by count, descending.
     */
    getTopErrors(entries, limit = 10) {
        const errorEntries = entries.filter(e =>
            e.level === 'ERROR' || e.status >= 400
        );

        const errorCounts = {};
        errorEntries.forEach(entry => {
            const key = entry.message || `HTTP ${entry.status}`;
            errorCounts[key] = (errorCounts[key] || 0) + 1;
        });

        return Object.entries(errorCounts)
            .sort(([, a], [, b]) => b - a)
            .slice(0, limit)
            .map(([error, count]) => ({ error, count }));
    }

    /**
     * Derives human-readable recommendations from the statistics.
     * @param {object} stats - The stats object produced by analyzeFile().
     * @param {object[]} entries - Parsed entries (currently unused).
     * @returns {string[]} Recommendation messages.
     */
    getRecommendations(stats, entries) {
        const recommendations = [];

        // Error rate; guard against division by zero on empty input
        const errorCount = stats.byLevel['ERROR'] || 0;
        const errorRate = stats.total > 0 ? errorCount / stats.total : 0;
        if (errorRate > 0.1) {
            recommendations.push(`错误率过高 (${(errorRate * 100).toFixed(1)}%)，建议检查应用状态`);
        }

        // Flag an hour whose volume exceeds three times the hourly average
        const hours = Object.keys(stats.byHour);
        if (hours.length > 0) {
            const avgPerHour = stats.total / hours.length;
            const maxHour = Object.entries(stats.byHour)
                .sort(([, a], [, b]) => b - a)[0];

            if (maxHour[1] > avgPerHour * 3) {
                recommendations.push(`${maxHour[0]}时流量异常 (${maxHour[1]}条)，建议检查该时段的系统负载`);
            }
        }

        return recommendations;
    }
}

// Usage example: analyze a small mixed-format log and print the report
const analyzer = new LogAnalyzer();

// One log entry per element; joined with "\n" to form the log text
const sampleLog = [
    '2023-12-01 10:30:45 [INFO] Application started',
    '2023-12-01 10:31:00 [ERROR] Database connection failed',
    '2023-12-01 10:31:15 [WARN] Retrying connection',
    '2023-12-01 10:31:30 [INFO] Connection restored',
    '192.168.1.100 - - [01/Dec/2023:10:32:00 +0000] "GET /api/users HTTP/1.1" 200 1234 "-" "Mozilla/5.0"',
    'Dec  1 10:33:15 server01 nginx[1234]: SSL certificate expires soon'
].join('\n');

const analysis = analyzer.analyzeFile(sampleLog);
const report = analyzer.generateReport(analysis);

console.log('日志分析报告:', JSON.stringify(report, null, 2));

11.2 代码格式检查器

代码风格检查器

/**
 * Line-based code style checker.
 *
 * Each line is checked independently with regular expressions, so the checks
 * are heuristics rather than a real parser: multi-line constructs and
 * unusual formatting can produce false positives or negatives.
 */
class CodeStyleChecker {
    constructor() {
        this.rules = {
            // JavaScript rules
            javascript: {
                // Last non-whitespace character is not ";" (apply to code with comments stripped)
                semicolon: /[^;\s]\s*$/,
                indentation: /^(\s*)/,                       // leading whitespace
                trailingSpaces: /\s+$/,                      // trailing whitespace
                longLine: /.{81,}/,                          // overly long line
                console: /console\.(log|error|warn|info)/,   // console statement
                debugger: /\bdebugger\b/,                    // debugger statement
                todoComment: /\/\/\s*TODO:/i,                // TODO comment
                emptyLine: /^\s*$/,                          // blank line
                // Capture ANY identifier; checkNaming() decides whether it is valid.
                functionNaming: /\bfunction\s+([A-Za-z_$][\w$]*)/,
                variableNaming: /\b(?:let|const|var)\s+([A-Za-z_$][\w$]*)/
            }
        };

        this.config = {
            maxLineLength: 80,
            indentSize: 2,
            indentType: 'spaces', // 'spaces' or 'tabs'
            allowConsole: false,
            allowDebugger: false,
            // NOTE: despite its name this flag enforces lowerCamelCase
            enforceUpperCamelCase: true
        };
    }

    /**
     * Checks a whole source text.
     * @param {string} code - Source text with LF or CRLF line endings.
     * @param {string} [language='javascript'] - Key into `this.rules`.
     * @param {string} [filename='unknown'] - Name echoed in the result.
     * @returns {{filename: string, language: string, stats: object, issues: object[], score: number}}
     */
    checkFile(code, language = 'javascript', filename = 'unknown') {
        // Accept CRLF too: a leftover "\r" would be reported as trailing space
        const lines = code.split(/\r?\n/);
        const issues = [];
        const stats = {
            lines: lines.length,
            emptyLines: 0,
            codeLines: 0,
            commentLines: 0,
            functions: 0,
            variables: 0
        };

        lines.forEach((line, index) => {
            const lineNumber = index + 1;
            issues.push(...this.checkLine(line, lineNumber, language));
            this.updateStats(line, stats);
        });

        return {
            filename,
            language,
            stats,
            issues,
            score: this.calculateScore(issues, stats)
        };
    }

    /**
     * Returns the line with a trailing "//" comment removed.
     * Quotes are tracked so that "//" inside a string literal (for example
     * a URL) is not treated as a comment. Regex literals are not recognized.
     * @param {string} line
     * @returns {string} The code part of the line (whitespace preserved).
     */
    stripLineComment(line) {
        let quote = null;
        for (let i = 0; i < line.length; i++) {
            const ch = line[i];
            if (quote) {
                if (ch === '\\') {
                    i++; // skip the escaped character
                } else if (ch === quote) {
                    quote = null;
                }
            } else if (ch === '"' || ch === "'" || ch === '`') {
                quote = ch;
            } else if (ch === '/' && line[i + 1] === '/') {
                return line.slice(0, i);
            }
        }
        return line;
    }

    /**
     * Runs all single-line checks.
     * @param {string} line - Line text without line terminator.
     * @param {number} lineNumber - One-based line number used in issues.
     * @param {string} language - Key into `this.rules`.
     * @returns {object[]} Issues `{line, column, type, rule, message}`; empty
     *   when the language has no rules.
     */
    checkLine(line, lineNumber, language) {
        const issues = [];
        const rules = this.rules[language];

        if (!rules) return issues;

        // Statement-level checks look at the code only, so text inside a
        // trailing comment cannot trigger or hide an issue.
        const code = this.stripLineComment(line);

        // Line length
        if (line.length > this.config.maxLineLength) {
            issues.push({
                line: lineNumber,
                column: this.config.maxLineLength + 1,
                type: 'warning',
                rule: 'max-line-length',
                message: `行长度超过${this.config.maxLineLength}个字符`
            });
        }

        // Trailing whitespace
        if (rules.trailingSpaces.test(line)) {
            issues.push({
                line: lineNumber,
                column: line.length,
                type: 'error',
                rule: 'trailing-spaces',
                message: '行末存在多余空格'
            });
        }

        // Indentation (skipped for blank lines)
        const indentMatch = rules.indentation.exec(line);
        if (indentMatch && line.trim()) {
            const indent = indentMatch[1];
            if (this.config.indentType === 'spaces' && indent.includes('\t')) {
                issues.push({
                    line: lineNumber,
                    column: 1,
                    type: 'error',
                    rule: 'indent-type',
                    message: '使用空格缩进，不要使用制表符'
                });
            } else if (this.config.indentType === 'tabs' && indent.includes(' ')) {
                issues.push({
                    line: lineNumber,
                    column: 1,
                    type: 'error',
                    rule: 'indent-type',
                    message: '使用制表符缩进，不要使用空格'
                });
            }

            // Indentation width must be a multiple of indentSize
            if (this.config.indentType === 'spaces') {
                const spaceCount = indent.length;
                if (spaceCount % this.config.indentSize !== 0) {
                    issues.push({
                        line: lineNumber,
                        column: 1,
                        type: 'error',
                        rule: 'indent-size',
                        message: `缩进应该是${this.config.indentSize}的倍数`
                    });
                }
            }
        }

        // console statements
        if (!this.config.allowConsole && rules.console.test(code)) {
            issues.push({
                line: lineNumber,
                column: code.search(rules.console) + 1,
                type: 'warning',
                rule: 'no-console',
                message: '不应该在生产代码中使用console语句'
            });
        }

        // debugger statements
        if (!this.config.allowDebugger && rules.debugger.test(code)) {
            issues.push({
                line: lineNumber,
                column: code.search(rules.debugger) + 1,
                type: 'error',
                rule: 'no-debugger',
                message: '不应该在生产代码中使用debugger语句'
            });
        }

        // Missing semicolon (JavaScript only)
        if (language === 'javascript' && this.needsSemicolon(line)) {
            if (rules.semicolon.test(code)) {
                issues.push({
                    line: lineNumber,
                    // Point just past the last code character
                    column: code.trimEnd().length + 1,
                    type: 'error',
                    rule: 'semicolon',
                    message: '语句末尾缺少分号'
                });
            }
        }

        // Naming conventions
        this.checkNaming(line, lineNumber, language, issues);

        return issues;
    }

    /**
     * Tells whether a line is expected to end with a semicolon.
     * @param {string} line
     * @returns {boolean} False for blank lines, comments, block openers and
     *   closers, control-flow headers, declarations and continuation lines.
     */
    needsSemicolon(line) {
        const code = this.stripLineComment(line).trim();
        if (!code) return false; // blank or comment-only line

        // Lines matching any of these do not need a semicolon
        const noSemicolonPatterns = [
            /^\s*\/\//,           // line comment
            /^\s*\/\*/,           // block comment start
            /^\s*\*/,             // block comment body
            /^\s*\*\//,           // block comment end
            /\{$/,                // ends with an opening brace
            /\}$/,                // ends with a closing brace
            /[,(\[]$/,            // continues on the next line
            /^(?:case\b.*|default\s*):$/, // switch labels
            /^\s*if\s*\(/,        // if statement
            /^\s*else/,           // else branch
            /^\s*for\s*\(/,       // for loop
            /^\s*while\s*\(/,     // while loop
            /^\s*switch\s*\(/,    // switch statement
            /^\s*function/,       // function definition
            /^\s*class/,          // class definition
        ];

        return !noSemicolonPatterns.some(pattern => pattern.test(code));
    }

    /**
     * Appends naming issues for function and variable declarations on the
     * line to `issues`. Names must be lowerCamelCase; `const` declarations
     * may also use SCREAMING_SNAKE_CASE.
     * @param {string} line
     * @param {number} lineNumber
     * @param {string} language
     * @param {object[]} issues - Mutated: new issues are pushed onto it.
     */
    checkNaming(line, lineNumber, language, issues) {
        const rules = this.rules[language];
        if (!rules || !this.config.enforceUpperCamelCase) return;

        const code = this.stripLineComment(line);
        const lowerCamel = /^[a-z][a-zA-Z0-9]*$/;
        // Column of the captured name: it is always the tail of the match
        const nameColumn = (match) => match.index + match[0].length - match[1].length + 1;

        // Function names
        const funcMatch = rules.functionNaming.exec(code);
        if (funcMatch && !lowerCamel.test(funcMatch[1])) {
            issues.push({
                line: lineNumber,
                column: nameColumn(funcMatch),
                type: 'warning',
                rule: 'function-naming',
                message: '函数名应使用小写字母开头的驼峰命名'
            });
        }

        // Variable names
        const varMatch = rules.variableNaming.exec(code);
        if (varMatch) {
            const varName = varMatch[1];
            const isConstantStyle = varMatch[0].startsWith('const') && /^[A-Z][A-Z0-9_]*$/.test(varName);
            if (!lowerCamel.test(varName) && !isConstantStyle) {
                issues.push({
                    line: lineNumber,
                    column: nameColumn(varMatch),
                    type: 'warning',
                    rule: 'variable-naming',
                    message: '变量名应使用小写字母开头的驼峰命名'
                });
            }
        }
    }

    /**
     * Updates the line/function/variable counters for one line.
     * @param {string} line
     * @param {object} stats - Mutated counters object created by checkFile().
     */
    updateStats(line, stats) {
        const trimmed = line.trim();
        if (trimmed === '') {
            stats.emptyLines++;
        } else if (trimmed.startsWith('//') || trimmed.startsWith('/*') || trimmed.startsWith('*')) {
            // Lines starting with "*" are treated as block-comment body,
            // matching needsSemicolon()
            stats.commentLines++;
        } else {
            stats.codeLines++;
        }

        // Count function and variable declarations
        if (/\bfunction\s+\w+/.test(line)) {
            stats.functions++;
        }
        if (/\b(?:let|const|var)\s+\w+/.test(line)) {
            stats.variables++;
        }
    }

    /**
     * Computes a 0-100 quality score: each error costs 5 points and each
     * warning 2 points, normalized by the number of non-blank lines.
     * @returns {number} Rounded score; 100 when there are no non-blank lines.
     */
    calculateScore(issues, stats) {
        const errorCount = issues.filter(i => i.type === 'error').length;
        const warningCount = issues.filter(i => i.type === 'warning').length;

        const totalLines = stats.codeLines + stats.commentLines; // blank lines excluded
        if (totalLines === 0) return 100;

        const errorPenalty = errorCount * 5;
        const warningPenalty = warningCount * 2;
        const totalPenalty = errorPenalty + warningPenalty;

        const score = Math.max(0, 100 - (totalPenalty / totalLines) * 100);
        return Math.round(score);
    }

    /**
     * Aggregates several checkFile() results into one report.
     * @param {object[]} results - Results returned by checkFile().
     * @returns {object} Summary, files sorted by ascending score, and
     *   recommendations. The input array is not modified.
     */
    generateReport(results) {
        const totalIssues = results.reduce((sum, r) => sum + r.issues.length, 0);
        // With no files there is nothing to penalize, as in calculateScore()
        const avgScore = results.length > 0
            ? results.reduce((sum, r) => sum + r.score, 0) / results.length
            : 100;

        const issuesByType = {};
        const issuesByRule = {};

        results.forEach(result => {
            result.issues.forEach(issue => {
                issuesByType[issue.type] = (issuesByType[issue.type] || 0) + 1;
                issuesByRule[issue.rule] = (issuesByRule[issue.rule] || 0) + 1;
            });
        });

        return {
            summary: {
                filesChecked: results.length,
                totalIssues,
                averageScore: Math.round(avgScore),
                issuesByType,
                issuesByRule: Object.entries(issuesByRule)
                    .sort(([, a], [, b]) => b - a)
                    .slice(0, 10) // ten most frequent rules
            },
            // Sort a copy so the caller's array keeps its order
            files: [...results].sort((a, b) => a.score - b.score),
            recommendations: this.getStyleRecommendations(issuesByRule, totalIssues)
        };
    }

    /**
     * Produces advice for the three most frequent rules.
     * @param {object} issuesByRule - Map of rule name to issue count.
     * @param {number} totalIssues - Total number of issues.
     * @returns {string[]} Recommendation messages.
     */
    getStyleRecommendations(issuesByRule, totalIssues) {
        const recommendations = [];

        Object.entries(issuesByRule)
            .sort(([, a], [, b]) => b - a)
            .slice(0, 3)
            .forEach(([rule, count]) => {
                const percentage = (count / totalIssues * 100).toFixed(1);

                switch (rule) {
                    case 'trailing-spaces':
                        recommendations.push(`${percentage}% 的问题是行末空格，建议配置编辑器自动删除`);
                        break;
                    case 'semicolon':
                        recommendations.push(`${percentage}% 的问题是缺少分号，建议使用ESLint自动修复`);
                        break;
                    case 'no-console':
                        recommendations.push(`${percentage}% 的问题是console语句，建议替换为日志库`);
                        break;
                    case 'max-line-length':
                        recommendations.push(`${percentage}% 的问题是行过长，建议重构复杂表达式`);
                        break;
                    default:
                        recommendations.push(`最常见问题是 ${rule} (${percentage}%)`);
                }
            });

        return recommendations;
    }
}

// Usage example: check a deliberately sloppy snippet and print the report
const checker = new CodeStyleChecker();

// One source line per element; the trailing '' yields the final newline
const sampleCode = [
    'function helloWorld(){',
    '    console.log("Hello World")    // 缺少分号，有行末空格',
    '        let x=42;  // 缩进错误',
    '    debugger; // 不应该有debugger',
    '}',
    '',
    'let VeryLongVariableName = "This is a very long line that exceeds the maximum line length limit and should be broken into multiple lines";',
    ''
].join('\n');

const result = checker.checkFile(sampleCode, 'javascript', 'sample.js');
const report = checker.generateReport([result]);

console.log('代码检查报告:', JSON.stringify(report, null, 2));

11.3 文本内容提取工具

多格式文本提取器

/**
 * Extracts structured items (emails, URLs, IPs, ...) from text.
 *
 * Content is first reduced to plain text according to its detected type
 * (HTML, Markdown, XML, JSON, CSV, log), then scanned with the regular
 * expressions in `this.patterns`. All patterns are heuristics.
 */
class TextExtractor {
    constructor() {
        // Content type -> function turning raw content into plain text
        this.extractors = {
            html: this.extractFromHtml,
            markdown: this.extractFromMarkdown,
            xml: this.extractFromXml,
            json: this.extractFromJson,
            csv: this.extractFromCsv,
            log: this.extractFromLog
        };

        this.patterns = {
            // TLD class is [A-Za-z]; a "|" inside a character class is a literal
            email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g,
            phone: /\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b/g,
            // Path, query and fragment accept "-" (plus "~", "%", "+") so
            // URLs such as /my-page are not cut short
            url: /https?:\/\/[-\w.]+(?::[0-9]+)?(?:\/[\w\/.~%-]*)?(?:\?[\w&=%.+-]*)?(?:#[\w.-]*)?/g,
            ip: /\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b/g,
            date: /\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b|\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b/g,
            time: /\b\d{1,2}:\d{2}(?::\d{2})?\s?(?:[AP]M)?\b/gi,
            number: /\b\d+(?:\.\d+)?\b/g,
            currency: /\$\d+(?:\.\d{2})?|\d+(?:\.\d{2})?\s?(?:USD|EUR|GBP|CNY)/g,
            creditCard: /\b(?:\d{4}[-\s]?){3}\d{4}\b/g,
            hashtag: /#\w+/g,
            mention: /@\w+/g
        };
    }

    /**
     * Guesses the content type; the first matching detector wins.
     * @param {string} content
     * @returns {string} 'json' | 'xml' | 'html' | 'markdown' | 'csv' | 'log' | 'text'.
     */
    detectContentType(content) {
        const detectors = [
            { type: 'json', pattern: /^\s*\{.*\}\s*$/s },
            // An explicit XML declaration is decisive
            { type: 'xml', pattern: /^\s*<\?xml/i },
            // HTML must be tested before the generic tag pattern below,
            // which matches any markup including HTML
            { type: 'html', pattern: /<html|<head|<body|<div|<p>/i },
            { type: 'xml', pattern: /<\w+.*>/i },
            { type: 'markdown', pattern: /^#{1,6}\s|^\*\s|\[.*\]\(.*\)/m },
            { type: 'csv', pattern: /^[^,\n]*,[^,\n]*,/m },
            { type: 'log', pattern: /\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}|\[\w+\]/m }
        ];

        for (const detector of detectors) {
            if (detector.pattern.test(content)) {
                return detector.type;
            }
        }

        return 'text';
    }

    /**
     * Extracts items of the requested kinds from `content`.
     * @param {string} content - Raw content.
     * @param {?string[]} [types=null] - Keys of `this.patterns`; all when omitted.
     *   Unknown keys are ignored.
     * @param {?string} [contentType=null] - Content type; auto-detected when omitted.
     * @returns {{contentType: string, originalLength: number, textLength: number, extracted: object}}
     *   `extracted` maps each type to its de-duplicated matches.
     */
    extract(content, types = null, contentType = null) {
        if (!contentType) {
            contentType = this.detectContentType(content);
        }

        let text = content;

        // Pre-processing: reduce the content to plain text for its type
        if (this.extractors[contentType]) {
            text = this.extractors[contentType].call(this, content);
        }

        const results = {
            contentType,
            originalLength: content.length,
            textLength: text.length,
            extracted: {}
        };

        const extractTypes = types || Object.keys(this.patterns);

        extractTypes.forEach(type => {
            if (this.patterns[type]) {
                const matches = text.match(this.patterns[type]);
                results.extracted[type] = matches ? [...new Set(matches)] : []; // de-duplicate
            }
        });

        return results;
    }

    /**
     * Returns the visible text of an HTML document: script/style content and
     * tags are removed, common entities are decoded, whitespace is collapsed.
     */
    extractFromHtml(html) {
        // Drop <script> and <style> elements including their content
        let text = html.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
        text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');

        // Drop remaining tags
        text = text.replace(/<[^>]+>/g, ' ');

        // Decode entities: a few named ones plus any numeric reference
        const entities = {
            '&amp;': '&',
            '&lt;': '<',
            '&gt;': '>',
            '&quot;': '"',
            '&apos;': "'",
            '&nbsp;': ' '
        };

        text = text.replace(/&(?:#(x[0-9a-f]+|\d+)|\w+);/gi, (match, numeric) => {
            if (numeric === undefined) {
                return entities[match] || match; // unknown names stay as-is
            }
            const codePoint = /^x/i.test(numeric)
                ? Number.parseInt(numeric.slice(1), 16)
                : Number.parseInt(numeric, 10);
            // String.fromCodePoint throws above U+10FFFF
            return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : match;
        });

        // Collapse whitespace
        return text.replace(/\s+/g, ' ').trim();
    }

    /**
     * Returns Markdown content with its markup removed and whitespace collapsed.
     */
    extractFromMarkdown(markdown) {
        let text = markdown;

        // Drop fenced code blocks and inline code
        text = text.replace(/```[\s\S]*?```/g, '');
        text = text.replace(/`[^`]+`/g, '');

        // Images first: the link pattern would otherwise consume
        // "[alt](url)" and leave the leading "!" behind
        text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1');

        // Links: keep the link text
        text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');

        // Heading and list markers must be followed by a space, so "#tag",
        // "**bold**" and "3.14" at the start of a line are left intact
        text = text.replace(/^#{1,6}[ \t]+/gm, '');
        text = text.replace(/^[ \t]*[-*+][ \t]+/gm, '');
        text = text.replace(/^[ \t]*\d+\.[ \t]+/gm, '');

        // Emphasis markers; underscores only count at word boundaries so
        // snake_case identifiers survive
        text = text.replace(/\*\*([^*]+)\*\*/g, '$1');
        text = text.replace(/\*([^*]+)\*/g, '$1');
        text = text.replace(/(?<!\w)__([^_]+)__(?!\w)/g, '$1');
        text = text.replace(/(?<!\w)_([^_]+)_(?!\w)/g, '$1');

        // Collapse whitespace
        return text.replace(/\s+/g, ' ').trim();
    }

    /**
     * Returns the text content of an XML document: declaration, comments and
     * tags are removed, CDATA content is kept, whitespace is collapsed.
     */
    extractFromXml(xml) {
        // Drop the XML declaration and comments
        let text = xml.replace(/<\?xml[^>]*\?>/g, '');
        text = text.replace(/<!--[\s\S]*?-->/g, '');

        // Unwrap CDATA sections
        text = text.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');

        // Drop tags, keep their content
        text = text.replace(/<[^>]+>/g, ' ');

        // Collapse whitespace
        return text.replace(/\s+/g, ' ').trim();
    }

    /**
     * Returns all string values found anywhere in a JSON document, joined by
     * spaces. Keys and non-string values are ignored.
     * @returns {string} The joined strings, or the input unchanged when it is
     *   not valid JSON (deliberate best-effort fallback).
     */
    extractFromJson(jsonStr) {
        try {
            const data = JSON.parse(jsonStr);
            const texts = [];

            const extractValue = (obj) => {
                if (typeof obj === 'string') {
                    texts.push(obj);
                } else if (typeof obj === 'object' && obj !== null) {
                    if (Array.isArray(obj)) {
                        obj.forEach(extractValue);
                    } else {
                        Object.values(obj).forEach(extractValue);
                    }
                }
            };

            extractValue(data);
            return texts.join(' ');
        } catch (e) {
            return jsonStr; // not valid JSON: fall back to the raw text
        }
    }

    /**
     * Returns all non-empty CSV fields joined by spaces.
     * Simple parser: splits on every comma and does not handle quoted commas.
     */
    extractFromCsv(csv) {
        const lines = csv.split('\n');
        const allTexts = [];

        lines.forEach(line => {
            const fields = line.split(',').map(field =>
                field.trim().replace(/^["']|["']$/g, '')
            );
            allTexts.push(...fields);
        });

        return allTexts.filter(text => text.length > 0).join(' ');
    }

    /**
     * Returns the message part of each log line, joined by spaces.
     * Timestamp and level prefixes are removed where recognized; other lines
     * are kept whole.
     */
    extractFromLog(log) {
        const lines = log.split('\n');
        const messages = [];

        lines.forEach(line => {
            // "[LEVEL] message": group 2 is the message
            const levelMatch = line.match(/\[(INFO|WARN|ERROR|DEBUG)\]\s*(.+)/);
            // "timestamp message": this pattern has a single group
            const timestampMatch = levelMatch
                ? null
                : line.match(/\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}.*?\s(.+)/);

            let message = line;
            if (levelMatch) {
                message = levelMatch[2];
            } else if (timestampMatch) {
                message = timestampMatch[1];
            }

            message = message.trim();
            if (message) {
                messages.push(message);
            }
        });

        return messages.join(' ');
    }

    /**
     * Extracts all matches of a custom pattern.
     * @param {string} text - Text to search.
     * @param {string|RegExp} pattern - Pattern source or RegExp.
     * @param {object} [options]
     * @param {string} [options.flags='g'] - Regex flags; "g" is added when
     *   missing because matchAll() requires a global regex.
     * @param {?Function} [options.transform] - Maps each match array.
     * @param {?Function} [options.filter] - Keeps results for which it returns true.
     * @param {?number} [options.limit] - Maximum number of results.
     * @returns {Array} Match arrays, or transformed values when `transform` is given.
     */
    extractCustomPattern(text, pattern, options = {}) {
        const {
            flags = 'g',
            transform = null,
            filter = null,
            limit = null
        } = options;

        const effectiveFlags = flags.includes('g') ? flags : `${flags}g`;
        const regex = new RegExp(pattern, effectiveFlags);
        let matches = [...text.matchAll(regex)];

        // Transform the matches
        if (transform) {
            matches = matches.map(transform);
        }

        // Filter the results
        if (filter) {
            matches = matches.filter(filter);
        }

        // Limit the number of results (0 is a valid limit)
        if (Number.isInteger(limit) && limit >= 0) {
            matches = matches.slice(0, limit);
        }

        return matches;
    }

    /**
     * Builds a report from the result of extract().
     * @param {object} extractResults - Value returned by extract().
     * @returns {object} Summary, per-type findings (first ten items each) and
     *   statistics on email domains and URL protocols.
     */
    generateReport(extractResults) {
        const { originalLength, textLength } = extractResults;
        // Empty content has nothing to compress; avoid dividing by zero
        const ratio = originalLength > 0 ? 1 - textLength / originalLength : 0;

        const report = {
            summary: {
                contentType: extractResults.contentType,
                originalLength,
                processedLength: textLength,
                compressionRatio: ratio.toFixed(2)
            },
            findings: {},
            statistics: {}
        };

        Object.entries(extractResults.extracted).forEach(([type, items]) => {
            report.findings[type] = {
                count: items.length,
                items: items.slice(0, 10), // first ten only
                hasMore: items.length > 10
            };

            // Five most frequent email domains
            if (type === 'email') {
                const domainCounts = {};
                items.forEach(email => {
                    const domain = email.split('@')[1];
                    domainCounts[domain] = (domainCounts[domain] || 0) + 1;
                });
                report.statistics.emailDomains = Object.entries(domainCounts)
                    .sort(([, a], [, b]) => b - a)
                    .slice(0, 5);
            }

            // URL count per protocol
            if (type === 'url') {
                const protocolCounts = {};
                items.forEach(url => {
                    const protocol = url.split(':')[0];
                    protocolCounts[protocol] = (protocolCounts[protocol] || 0) + 1;
                });
                report.statistics.urlProtocols = protocolCounts;
            }
        });

        return report;
    }
}

// Usage example: extract contact data from a small HTML page and print the report
const extractor = new TextExtractor();

// One markup line per element; the leading and trailing '' reproduce the
// newlines at both ends of the document
const sampleHtml = [
    '',
    '<html>',
    '<head><title>示例页面</title></head>',
    '<body>',
    '    <h1>联系我们</h1>',
    '    <p>邮箱: info@example.com, support@example.org</p>',
    '    <p>电话: +1-555-123-4567</p>',
    '    <p>网站: https://example.com</p>',
    '    <p>服务器IP: 192.168.1.100</p>',
    "    <script>console.log('这是脚本');</script>",
    '</body>',
    '</html>',
    ''
].join('\n');

const results = extractor.extract(sampleHtml);
const report = extractor.generateReport(results);

console.log('文本提取报告:', JSON.stringify(report, null, 2));

小结

本章通过三个综合实战项目展示了正则表达式在文本处理中的强大应用：

日志分析工具：
- 多格式日志解析
- 自动格式检测
- 统计分析和报告生成
- 查询和过滤功能
代码格式检查器：
- 多规则代码检查
- 可配置的检查选项
- 详细的问题报告
- 代码质量评分
文本内容提取工具：
- 多格式文本处理
- 智能内容识别
- 结构化数据提取
- 统计分析功能

这些工具展示了正则表达式在实际项目中的应用价值，为复杂的文本处理任务提供了高效的解决方案。