Chapter 11: Comprehensive Text Processing Projects

Haiyue
22min

Chapter 11: Comprehensive Text Processing Projects

Learning Objectives

  1. Implement a log file analysis tool
  2. Create a code format checker
  3. Develop a text content extraction tool
  4. Design data cleaning and format conversion programs
  5. Build a simple template engine

11.1 Log File Analysis Tool

Log Parser

/**
 * Parses log text in several common formats (Apache access log, a
 * "YYYY-MM-DD HH:MM:SS [LEVEL] message" application format, syslog and
 * single-line JSON), collects simple statistics and builds a summary report.
 */
class LogAnalyzer {
    constructor() {
        // Supported line formats. detectFormat() tries them in insertion order,
        // so only format patterns belong in this object.
        this.patterns = {
            // Apache access log
            apache: /^(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+|-) "([^"]*)" "([^"]*)"/,

            // Standard application log: 2023-12-01 10:30:45 [INFO] Message
            standard: /^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)$/,

            // Syslog format. Days 1-9 are usually space-padded ("Dec  1"), so the
            // day accepts one or two digits after variable spacing.
            syslog: /^(\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}) (\S+) (\w+)(?:\[(\d+)\])?: (.+)$/,

            // JSON format
            json: /^\{.*\}$/
        };

        // First HH:MM:SS group that is not the tail of a longer digit run
        // (so the "23:10:32" inside "2023:10:32:00" is not mistaken for a time).
        this.hourPattern = /(?:^|\D)(\d{2}):\d{2}:\d{2}/;

        // Apache "01/Dec/2023:10:32:00 +0000" timestamps, converted to ISO 8601
        // by parseTimestamp() because Date cannot parse them reliably.
        this.apacheTimePattern = /^(\d{2})\/([A-Za-z]{3})\/(\d{4}):(\d{2}):(\d{2}):(\d{2})\s([+\-])(\d{2})(\d{2})$/;

        // Syslog timestamps carry no year and therefore cannot be placed in time.
        this.yearlessTimePattern = /^[A-Za-z]{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}$/;

        this.months = {
            Jan: '01', Feb: '02', Mar: '03', Apr: '04', May: '05', Jun: '06',
            Jul: '07', Aug: '08', Sep: '09', Oct: '10', Nov: '11', Dec: '12'
        };

        this.stats = {
            total: 0,
            byLevel: {},
            byHour: {},
            errors: []
        };
    }

    /**
     * Returns the name of the first pattern matching the line, or 'unknown'.
     * @param {string} logLine
     * @returns {string}
     */
    detectFormat(logLine) {
        for (const [format, pattern] of Object.entries(this.patterns)) {
            if (pattern.test(logLine)) {
                return format;
            }
        }
        return 'unknown';
    }

    /**
     * Parses one line. When `format` is omitted it is auto-detected.
     * @param {string} logLine
     * @param {?string} [format]
     * @returns {?Object} parsed entry, `{ format: 'unknown', raw }` for
     *   unrecognized lines, or null when the chosen parser rejects the line.
     */
    parseLine(logLine, format = null) {
        if (!format) {
            format = this.detectFormat(logLine);
        }

        switch (format) {
            case 'apache':
                return this.parseApacheLog(logLine);
            case 'standard':
                return this.parseStandardLog(logLine);
            case 'syslog':
                return this.parseSyslog(logLine);
            case 'json':
                return this.parseJsonLog(logLine);
            default:
                return { format: 'unknown', raw: logLine };
        }
    }

    parseApacheLog(logLine) {
        const match = this.patterns.apache.exec(logLine);
        if (!match) return null;

        return {
            format: 'apache',
            ip: match[1],
            timestamp: match[2],
            method: match[3],
            url: match[4],
            protocol: match[5],
            status: Number.parseInt(match[6], 10),
            // "-" means no body was sent
            size: match[7] === '-' ? 0 : Number.parseInt(match[7], 10),
            referer: match[8],
            userAgent: match[9]
        };
    }

    parseStandardLog(logLine) {
        const match = this.patterns.standard.exec(logLine);
        if (!match) return null;

        return {
            format: 'standard',
            timestamp: match[1],
            level: match[2],
            message: match[3]
        };
    }

    parseSyslog(logLine) {
        const match = this.patterns.syslog.exec(logLine);
        if (!match) return null;

        return {
            format: 'syslog',
            timestamp: match[1],
            host: match[2],
            process: match[3],
            pid: match[4] || null,
            message: match[5]
        };
    }

    /**
     * Parses a JSON log line; returns null when the text is not valid JSON.
     * Keys of the parsed object are spread after `format`, so a `format` key
     * in the data takes precedence.
     */
    parseJsonLog(logLine) {
        try {
            const data = JSON.parse(logLine);
            return {
                format: 'json',
                ...data
            };
        } catch (e) {
            // Expected for lines that merely look like JSON; the caller records it.
            return null;
        }
    }

    /**
     * Converts an entry timestamp to a Date.
     * @param {*} timestamp
     * @returns {?Date} null when the value is empty, has no year (syslog) or
     *   cannot be parsed.
     */
    parseTimestamp(timestamp) {
        if (!timestamp) return null;

        if (typeof timestamp === 'string') {
            if (this.yearlessTimePattern.test(timestamp)) return null;

            const apache = this.apacheTimePattern.exec(timestamp);
            if (apache) {
                const [, day, mon, year, hh, mm, ss, sign, tzh, tzm] = apache;
                const month = this.months[mon];
                if (!month) return null;
                timestamp = `${year}-${month}-${day}T${hh}:${mm}:${ss}${sign}${tzh}:${tzm}`;
            }
        }

        const date = new Date(timestamp);
        return Number.isNaN(date.getTime()) ? null : date;
    }

    /**
     * Parses every non-blank line of `logContent` and resets/rebuilds `this.stats`.
     * @param {string} logContent text with LF or CRLF line endings
     * @returns {{entries: Object[], stats: Object}}
     */
    analyzeFile(logContent) {
        const results = [];

        this.stats = { total: 0, byLevel: {}, byHour: {}, errors: [] };

        // Split on CRLF as well: a stray "\r" would defeat the "$"-anchored patterns.
        logContent.split(/\r?\n/).forEach((line, index) => {
            if (!line.trim()) return;

            // Numbered against the original text, so blank lines do not shift it.
            const lineNumber = index + 1;
            try {
                const parsed = this.parseLine(line);
                if (!parsed) {
                    this.stats.errors.push({
                        line: lineNumber,
                        error: 'Line matched a known format but could not be parsed'
                    });
                    return;
                }
                results.push(parsed);
                this.updateStats(parsed, index);
            } catch (error) {
                this.stats.errors.push({ line: lineNumber, error: error.message });
            }
        });

        this.stats.total = results.length;
        return {
            entries: results,
            stats: this.stats
        };
    }

    /**
     * Adds one entry to the per-level and per-hour counters.
     * @param {Object} entry
     * @param {number} lineIndex unused; kept for interface compatibility
     */
    updateStats(entry, lineIndex) {
        // Statistics by log level
        if (entry.level) {
            this.stats.byLevel[entry.level] = (this.stats.byLevel[entry.level] || 0) + 1;
        }

        // Statistics by hour: locate the HH:MM:SS part rather than assuming a
        // fixed column, because each format puts the time at a different offset.
        if (typeof entry.timestamp === 'string') {
            const timeMatch = this.hourPattern.exec(entry.timestamp);
            if (timeMatch) {
                const hour = timeMatch[1];
                this.stats.byHour[hour] = (this.stats.byHour[hour] || 0) + 1;
            }
        }
    }

    /**
     * Query functionality.
     * @param {Object[]} entries
     * @param {{level?: string, timeRange?: {start: Date, end: Date}, pattern?: string}} criteria
     * @returns {Object[]} entries satisfying every supplied criterion. Entries
     *   whose timestamp cannot be parsed are not excluded by `timeRange`.
     */
    filter(entries, criteria) {
        // Compile once instead of once per entry.
        const regex = criteria.pattern ? new RegExp(criteria.pattern, 'i') : null;

        return entries.filter(entry => {
            if (criteria.level && entry.level !== criteria.level) {
                return false;
            }

            if (criteria.timeRange) {
                const entryTime = this.parseTimestamp(entry.timestamp);
                if (entryTime &&
                    (entryTime < criteria.timeRange.start || entryTime > criteria.timeRange.end)) {
                    return false;
                }
            }

            if (regex && !regex.test(entry.message || entry.raw || '')) {
                return false;
            }

            return true;
        });
    }

    /**
     * Generate report from the result of analyzeFile().
     * @param {{entries: Object[], stats: Object}} analysis
     * @returns {Object}
     */
    generateReport(analysis) {
        const { entries, stats } = analysis;

        return {
            summary: {
                totalEntries: stats.total,
                errorLines: stats.errors.length,
                timeSpan: this.getTimeSpan(entries),
                formats: this.getFormatDistribution(entries)
            },
            levelDistribution: stats.byLevel,
            hourlyDistribution: stats.byHour,
            topErrors: this.getTopErrors(entries),
            recommendations: this.getRecommendations(stats, entries)
        };
    }

    /**
     * Earliest and latest parseable timestamps and the milliseconds between them.
     * @param {Object[]} entries
     * @returns {?{start: Date, end: Date, duration: number}} null when no entry
     *   has a parseable timestamp.
     */
    getTimeSpan(entries) {
        const timestamps = entries
            .map(e => e.timestamp)
            .filter(t => t)
            .map(t => this.parseTimestamp(t))
            .filter(d => d !== null)
            // Numeric comparator: the default sort compares string forms.
            .sort((a, b) => a - b);

        if (timestamps.length === 0) return null;

        const start = timestamps[0];
        const end = timestamps[timestamps.length - 1];
        return {
            start,
            end,
            duration: end - start
        };
    }

    getFormatDistribution(entries) {
        const distribution = {};
        entries.forEach(entry => {
            distribution[entry.format] = (distribution[entry.format] || 0) + 1;
        });
        return distribution;
    }

    /**
     * Most frequent error messages (level ERROR or HTTP status >= 400).
     * @param {Object[]} entries
     * @param {number} [limit=10]
     * @returns {{error: string, count: number}[]} sorted by descending count
     */
    getTopErrors(entries, limit = 10) {
        const errorEntries = entries.filter(e =>
            e.level === 'ERROR' || e.status >= 400
        );

        const errorCounts = {};
        errorEntries.forEach(entry => {
            const key = entry.message || `HTTP ${entry.status}`;
            errorCounts[key] = (errorCounts[key] || 0) + 1;
        });

        return Object.entries(errorCounts)
            .sort(([, a], [, b]) => b - a)
            .slice(0, limit)
            .map(([error, count]) => ({ error, count }));
    }

    /**
     * Derives human-readable hints from the statistics.
     * @param {Object} stats
     * @param {Object[]} entries unused; kept for interface compatibility
     * @returns {string[]}
     */
    getRecommendations(stats, entries) {
        const recommendations = [];

        // Check error rate (guard against an empty log)
        const errorCount = stats.byLevel['ERROR'] || 0;
        const errorRate = stats.total > 0 ? errorCount / stats.total : 0;
        if (errorRate > 0.1) {
            recommendations.push(`Error rate too high (${(errorRate * 100).toFixed(1)}%), check application status`);
        }

        // Check for abnormal time distribution. The average is taken over the
        // entries that actually carry an hour, not over all entries.
        const hourCounts = Object.entries(stats.byHour);
        if (hourCounts.length > 0) {
            const counted = hourCounts.reduce((sum, [, count]) => sum + count, 0);
            const avgPerHour = counted / hourCounts.length;
            const maxHour = hourCounts.reduce((best, cur) => (cur[1] > best[1] ? cur : best));

            if (maxHour[1] > avgPerHour * 3) {
                recommendations.push(`Abnormal traffic at hour ${maxHour[0]} (${maxHour[1]} entries), check system load during this period`);
            }
        }

        return recommendations;
    }
}

// Usage example: run the analyzer over a small log that mixes several formats
const analyzer = new LogAnalyzer();

// One array element per log line, joined with newlines
const sampleLog = [
    '2023-12-01 10:30:45 [INFO] Application started',
    '2023-12-01 10:31:00 [ERROR] Database connection failed',
    '2023-12-01 10:31:15 [WARN] Retrying connection',
    '2023-12-01 10:31:30 [INFO] Connection restored',
    '192.168.1.100 - - [01/Dec/2023:10:32:00 +0000] "GET /api/users HTTP/1.1" 200 1234 "-" "Mozilla/5.0"',
    'Dec  1 10:33:15 server01 nginx[1234]: SSL certificate expires soon'
].join('\n');

// Parse first, then summarize the parsed entries and statistics
const analysis = analyzer.analyzeFile(sampleLog);
const report = analyzer.generateReport(analysis);

console.log(
    'Log Analysis Report:',
    JSON.stringify(report, null, 2)
);

11.2 Code Format Checker

Code Style Checker

/**
 * Line-based, regex-driven style checker for JavaScript source text.
 * All checks are heuristics that look at one line at a time; they do not
 * parse the code, so multi-line constructs, block comments and regex
 * literals can produce false positives or negatives.
 */
class CodeStyleChecker {
    constructor() {
        this.rules = {
            // JavaScript rules
            javascript: {
                // Last non-blank character is not ";" (applied to comment-stripped code)
                semicolon: /[^;\s]\s*$/,
                indentation: /^(\s*)/,                       // Indentation check
                trailingSpaces: /\s+$/,                      // Trailing spaces
                longLine: /.{81,}/,                          // Line too long
                console: /console\.(log|error|warn|info)/,   // Console statements
                debugger: /\bdebugger\b/,                    // Debugger statements
                todoComment: /\/\/\s*TODO:/i,                // TODO comments
                emptyLine: /^\s*$/,                          // Empty line
                // Group 1 is the declared name, whatever its casing, so that the
                // naming check can actually reject badly cased names.
                functionNaming: /\bfunction\b\s*\*?\s*([A-Za-z_$][\w$]*)/,
                variableNaming: /\b(?:let|const|var)\s+([A-Za-z_$][\w$]*)/
            }
        };

        this.config = {
            maxLineLength: 80,
            indentSize: 2,
            indentType: 'spaces', // 'spaces' or 'tabs'
            allowConsole: false,
            allowDebugger: false,
            // Despite its name, this flag switches the lowerCamelCase naming
            // checks on and off; the key is kept for compatibility.
            enforceUpperCamelCase: true
        };
    }

    /**
     * Checks a whole file.
     * @param {string} code source text (LF or CRLF line endings)
     * @param {string} [language='javascript'] key into `this.rules`
     * @param {string} [filename='unknown'] label copied into the result
     * @returns {{filename: string, language: string, stats: Object, issues: Object[], score: number}}
     */
    checkFile(code, language = 'javascript', filename = 'unknown') {
        // Strip the "\r" of CRLF files; otherwise every line ends in whitespace.
        const lines = code.split(/\r?\n/);
        const issues = [];
        const stats = {
            lines: lines.length,
            emptyLines: 0,
            codeLines: 0,
            commentLines: 0,
            functions: 0,
            variables: 0
        };

        lines.forEach((line, index) => {
            issues.push(...this.checkLine(line, index + 1, language));
            this.updateStats(line, stats);
        });

        return {
            filename,
            language,
            stats,
            issues,
            score: this.calculateScore(issues, stats)
        };
    }

    /**
     * Runs every rule against one line.
     * @param {string} line
     * @param {number} lineNumber 1-based line number used in the issues
     * @param {string} language key into `this.rules`; unknown languages yield []
     * @returns {Object[]} issues with line, column, type, rule and message
     */
    checkLine(line, lineNumber, language) {
        const issues = [];
        const rules = this.rules[language];

        if (!rules) return issues;

        // Check line length
        if (line.length > this.config.maxLineLength) {
            issues.push({
                line: lineNumber,
                column: this.config.maxLineLength + 1,
                type: 'warning',
                rule: 'max-line-length',
                message: `Line length exceeds ${this.config.maxLineLength} characters`
            });
        }

        // Check trailing spaces
        if (rules.trailingSpaces.test(line)) {
            issues.push({
                line: lineNumber,
                column: line.length,
                type: 'error',
                rule: 'trailing-spaces',
                message: 'Trailing spaces detected'
            });
        }

        // Check indentation
        const indentMatch = rules.indentation.exec(line);
        if (indentMatch && line.trim()) {
            const indent = indentMatch[1];
            const hasTab = indent.includes('\t');
            if (this.config.indentType === 'spaces' && hasTab) {
                issues.push({
                    line: lineNumber,
                    column: 1,
                    type: 'error',
                    rule: 'indent-type',
                    message: 'Use spaces for indentation, not tabs'
                });
            } else if (this.config.indentType === 'tabs' && indent.includes(' ')) {
                issues.push({
                    line: lineNumber,
                    column: 1,
                    type: 'error',
                    rule: 'indent-type',
                    message: 'Use tabs for indentation, not spaces'
                });
            }

            // Check indentation size. Skipped when tabs are present (already
            // reported above) and for " * " block-comment lines, which are
            // conventionally offset by a single space.
            const isBlockCommentBody = /^\s*\*/.test(line);
            if (this.config.indentType === 'spaces' && !hasTab && !isBlockCommentBody) {
                const spaceCount = indent.length;
                if (spaceCount % this.config.indentSize !== 0) {
                    issues.push({
                        line: lineNumber,
                        column: 1,
                        type: 'error',
                        rule: 'indent-size',
                        message: `Indentation should be a multiple of ${this.config.indentSize}`
                    });
                }
            }
        }

        // Check console statements
        if (!this.config.allowConsole && rules.console.test(line)) {
            issues.push({
                line: lineNumber,
                column: line.search(rules.console) + 1,
                type: 'warning',
                rule: 'no-console',
                message: 'Console statements should not be used in production code'
            });
        }

        // Check debugger statements
        if (!this.config.allowDebugger && rules.debugger.test(line)) {
            issues.push({
                line: lineNumber,
                column: line.search(rules.debugger) + 1,
                type: 'error',
                rule: 'no-debugger',
                message: 'Debugger statements should not be used in production code'
            });
        }

        // Check semicolon (JavaScript specific). The test runs on the code with
        // any trailing "//" comment removed, so "foo(); // note" is accepted.
        if (language === 'javascript' && this.needsSemicolon(line)) {
            const code = this.stripLineComment(line);
            if (rules.semicolon.test(code)) {
                issues.push({
                    line: lineNumber,
                    column: code.replace(/\s+$/, '').length + 1,
                    type: 'error',
                    rule: 'semicolon',
                    message: 'Missing semicolon at end of statement'
                });
            }
        }

        // Check naming conventions
        this.checkNaming(line, lineNumber, language, issues);

        return issues;
    }

    /**
     * Removes a trailing `//` comment, ignoring `//` inside quoted strings.
     * Regex literals and block comments are not understood.
     * @param {string} line
     * @returns {string} the line up to (not including) the comment
     */
    stripLineComment(line) {
        let quote = null;
        for (let i = 0; i < line.length; i++) {
            const ch = line[i];
            if (quote) {
                if (ch === '\\') {
                    i++; // skip the escaped character
                } else if (ch === quote) {
                    quote = null;
                }
            } else if (ch === '"' || ch === "'" || ch === '`') {
                quote = ch;
            } else if (ch === '/' && line[i + 1] === '/') {
                return line.slice(0, i);
            }
        }
        return line;
    }

    /**
     * Decides whether a line is a statement that should end with ";".
     * @param {string} line
     * @returns {boolean} false for blank lines, comments, block openers/closers,
     *   control-flow headers, declarations and obvious continuation lines
     */
    needsSemicolon(line) {
        const trimmed = this.stripLineComment(line).trim();
        if (!trimmed) return false;

        // These statements don't need semicolons
        const noSemicolonPatterns = [
            /^\s*\/\//,           // Comment
            /^\s*\/\*/,           // Block comment start
            /^\s*\*/,             // Block comment middle
            /^\s*\*\//,           // Block comment end
            /\{$/,                // Ends with brace
            /\}$/,                // Ends with brace
            /[,(\[:]$/,           // Continues on the next line / case label
            /^\s*if\s*\(/,        // if statement
            /^\s*else/,           // else statement
            /^\s*for\s*\(/,       // for loop
            /^\s*while\s*\(/,     // while loop
            /^\s*switch\s*\(/,    // switch statement
            /^\s*function/,       // Function definition
            /^\s*class/,          // Class definition
        ];

        return !noSemicolonPatterns.some(pattern => pattern.test(trimmed));
    }

    /**
     * Appends naming warnings for the first function and first variable
     * declared on the line. Leading "_"/"$" are tolerated, and `const`
     * declarations may also use UPPER_SNAKE_CASE.
     * @param {string} line
     * @param {number} lineNumber
     * @param {string} language
     * @param {Object[]} issues array that receives the warnings
     */
    checkNaming(line, lineNumber, language, issues) {
        const rules = this.rules[language];
        if (!rules || !this.config.enforceUpperCamelCase) return;

        const lowerCamel = /^[_$]*(?:[a-z][a-zA-Z0-9]*)?$/;
        const upperSnake = /^[A-Z][A-Z0-9_]*$/;
        // Ignore identifiers that only appear inside a trailing comment.
        const code = this.stripLineComment(line);

        // Check function naming
        const funcMatch = rules.functionNaming.exec(code);
        if (funcMatch && !lowerCamel.test(funcMatch[1])) {
            issues.push({
                line: lineNumber,
                // The name is the last thing the pattern consumed
                column: funcMatch.index + funcMatch[0].length - funcMatch[1].length + 1,
                type: 'warning',
                rule: 'function-naming',
                message: 'Function name should use lowerCamelCase'
            });
        }

        // Check variable naming
        const varMatch = rules.variableNaming.exec(code);
        if (varMatch) {
            const varName = varMatch[1];
            const isConstant = varMatch[0].startsWith('const') && upperSnake.test(varName);
            if (!isConstant && !lowerCamel.test(varName)) {
                issues.push({
                    line: lineNumber,
                    column: varMatch.index + varMatch[0].length - varName.length + 1,
                    type: 'warning',
                    rule: 'variable-naming',
                    message: 'Variable name should use lowerCamelCase'
                });
            }
        }
    }

    /**
     * Updates the line-type and declaration counters in `stats` for one line.
     * @param {string} line
     * @param {Object} stats mutated in place
     */
    updateStats(line, stats) {
        const trimmed = line.trim();
        if (trimmed === '') {
            stats.emptyLines++;
        } else if (trimmed.startsWith('//') || trimmed.startsWith('/*') || trimmed.startsWith('*')) {
            // "*" covers the body and end of a block comment
            stats.commentLines++;
        } else {
            stats.codeLines++;
        }

        // Count functions and variables
        if (/\bfunction\s+\w+/.test(line)) {
            stats.functions++;
        }
        if (/(?:let|const|var)\s+\w+/.test(line)) {
            stats.variables++;
        }
    }

    /**
     * Scores a file from 0 to 100: each error costs 5 points and each warning
     * 2 points per non-empty line, expressed as a percentage.
     * @param {Object[]} issues
     * @param {Object} stats
     * @returns {number} rounded score; 100 when the file has no non-empty lines
     */
    calculateScore(issues, stats) {
        const errorCount = issues.filter(i => i.type === 'error').length;
        const warningCount = issues.filter(i => i.type === 'warning').length;

        const totalLines = stats.codeLines + stats.commentLines; // Excluding empty lines
        if (totalLines === 0) return 100;

        const errorPenalty = errorCount * 5;
        const warningPenalty = warningCount * 2;
        const totalPenalty = errorPenalty + warningPenalty;

        const score = Math.max(0, 100 - (totalPenalty / totalLines) * 100);
        return Math.round(score);
    }

    /**
     * Aggregates several checkFile() results into one report.
     * @param {Object[]} results not modified
     * @returns {Object} summary, files sorted by ascending score, recommendations
     */
    generateReport(results) {
        const totalIssues = results.reduce((sum, r) => sum + r.issues.length, 0);
        // With nothing checked there is nothing to penalize; mirror
        // calculateScore(), which gives an empty file 100.
        const avgScore = results.length > 0
            ? results.reduce((sum, r) => sum + r.score, 0) / results.length
            : 100;

        const issuesByType = {};
        const issuesByRule = {};

        results.forEach(result => {
            result.issues.forEach(issue => {
                issuesByType[issue.type] = (issuesByType[issue.type] || 0) + 1;
                issuesByRule[issue.rule] = (issuesByRule[issue.rule] || 0) + 1;
            });
        });

        return {
            summary: {
                filesChecked: results.length,
                totalIssues,
                averageScore: Math.round(avgScore),
                issuesByType,
                issuesByRule: Object.entries(issuesByRule)
                    .sort(([, a], [, b]) => b - a)
                    .slice(0, 10) // Top 10 most common issues
            },
            // Sort a copy so the caller's array keeps its order
            files: [...results].sort((a, b) => a.score - b.score), // Sort by score
            recommendations: this.getStyleRecommendations(issuesByRule, totalIssues)
        };
    }

    /**
     * Builds one recommendation for each of the three most frequent rules.
     * @param {Object} issuesByRule rule name -> count
     * @param {number} totalIssues
     * @returns {string[]}
     */
    getStyleRecommendations(issuesByRule, totalIssues) {
        const recommendations = [];

        Object.entries(issuesByRule)
            .sort(([, a], [, b]) => b - a)
            .slice(0, 3)
            .forEach(([rule, count]) => {
                const percentage = (count / totalIssues * 100).toFixed(1);

                switch (rule) {
                    case 'trailing-spaces':
                        recommendations.push(`${percentage}% of issues are trailing spaces, configure editor to remove automatically`);
                        break;
                    case 'semicolon':
                        recommendations.push(`${percentage}% of issues are missing semicolons, use ESLint auto-fix`);
                        break;
                    case 'no-console':
                        recommendations.push(`${percentage}% of issues are console statements, replace with logging library`);
                        break;
                    case 'max-line-length':
                        recommendations.push(`${percentage}% of issues are lines too long, refactor complex expressions`);
                        break;
                    default:
                        recommendations.push(`Most common issue is ${rule} (${percentage}%)`);
                }
            });

        return recommendations;
    }
}

// Usage example: check a short snippet and print the aggregated report
const checker = new CodeStyleChecker();

// One array element per source line; the final empty element yields the
// trailing newline of the snippet.
const sampleCode = [
    'function helloWorld(){',
    '    console.log("Hello World")    // Missing semicolon, has trailing spaces',
    '        let x=42;  // Indentation error',
    '    debugger; // Should not have debugger',
    '}',
    '',
    'let VeryLongVariableName = "This is a very long line that exceeds the maximum line length limit and should be broken into multiple lines";',
    ''
].join('\n');

// generateReport() takes a list of per-file results, hence the one-element array
const result = checker.checkFile(sampleCode, 'javascript', 'sample.js');
const report = checker.generateReport([result]);

console.log(
    'Code Check Report:',
    JSON.stringify(report, null, 2)
);

Summary

This chapter demonstrated how regular expressions can be applied to practical text processing through three comprehensive projects:

  1. Log Analysis Tool:

    • Multi-format log parsing
    • Automatic format detection
    • Statistical analysis and report generation
    • Query and filter functionality
  2. Code Format Checker:

    • Multi-rule code checking
    • Configurable check options
    • Detailed issue reporting
    • Code quality scoring
  3. Text Content Extraction Tool:

    • Multi-format text processing
    • Intelligent content recognition
    • Structured data extraction
    • Statistical analysis functionality

These tools demonstrate the application value of regular expressions in real projects, providing efficient solutions for complex text processing tasks.