Chapter 11: Comprehensive Text Processing Projects
Haiyue
22min
Chapter 11: Comprehensive Text Processing Projects
Learning Objectives
- Implement a log file analysis tool
- Create a code format checker
- Develop a text content extraction tool
- Design data cleaning and format conversion programs
- Build a simple template engine
11.1 Log File Analysis Tool
Log Parser
/**
 * Parses log lines in several common formats (Apache access log, a
 * "YYYY-MM-DD HH:MM:SS [LEVEL] message" application log, syslog and
 * single-line JSON), collects simple statistics and builds a summary report.
 */
class LogAnalyzer {
  constructor() {
    // Support multiple log formats. detectFormat() tries them in this order.
    this.patterns = {
      // Apache access log
      apache: /^(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+|-) "([^"]*)" "([^"]*)"/,
      // Standard application log: 2023-12-01 10:30:45 [INFO] Message
      standard: /^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)$/,
      // Syslog format. Single-digit days are written either as "Dec 1" or
      // space-padded as "Dec  1", so accept 1-2 digits after any whitespace run.
      syslog: /^(\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}) (\S+) (\w+)(?:\[(\d+)\])?: (.+)$/,
      // JSON format
      json: /^\{.*\}$/
    };
    // Month abbreviations used by Apache timestamps, mapped to 0-based months.
    this.monthIndex = {
      Jan: 0, Feb: 1, Mar: 2, Apr: 3, May: 4, Jun: 5,
      Jul: 6, Aug: 7, Sep: 8, Oct: 9, Nov: 10, Dec: 11
    };
    this.stats = this.createEmptyStats();
  }

  /**
   * Builds a fresh statistics container.
   * @returns {{total: number, byLevel: Object, byHour: Object, errors: Array}}
   */
  createEmptyStats() {
    return { total: 0, byLevel: {}, byHour: {}, errors: [] };
  }

  /**
   * Returns the name of the first pattern that matches the line.
   * @param {string} logLine - A single log line without line terminator.
   * @returns {string} 'apache', 'standard', 'syslog', 'json' or 'unknown'.
   */
  detectFormat(logLine) {
    for (const [format, pattern] of Object.entries(this.patterns)) {
      if (pattern.test(logLine)) {
        return format;
      }
    }
    return 'unknown';
  }

  /**
   * Parses one line, auto-detecting the format unless one is given.
   * @param {string} logLine - A single log line.
   * @param {?string} [format=null] - Format name to force, or null to detect.
   * @returns {?Object} Parsed entry, `{ format: 'unknown', raw }` for
   *   unrecognized lines, or null when the format-specific parser fails.
   */
  parseLine(logLine, format = null) {
    const effectiveFormat = format || this.detectFormat(logLine);
    switch (effectiveFormat) {
      case 'apache':
        return this.parseApacheLog(logLine);
      case 'standard':
        return this.parseStandardLog(logLine);
      case 'syslog':
        return this.parseSyslog(logLine);
      case 'json':
        return this.parseJsonLog(logLine);
      default:
        return { format: 'unknown', raw: logLine };
    }
  }

  /**
   * Parses an Apache access log line.
   * @param {string} logLine
   * @returns {?Object} Entry with ip, timestamp, method, url, protocol,
   *   status, size (0 when logged as "-"), referer and userAgent; null when
   *   the line does not match.
   */
  parseApacheLog(logLine) {
    const match = this.patterns.apache.exec(logLine);
    if (!match) return null;
    return {
      format: 'apache',
      ip: match[1],
      timestamp: match[2],
      method: match[3],
      url: match[4],
      protocol: match[5],
      // Always pass the radix so the digits are read as decimal.
      status: Number.parseInt(match[6], 10),
      size: match[7] === '-' ? 0 : Number.parseInt(match[7], 10),
      referer: match[8],
      userAgent: match[9]
    };
  }

  /**
   * Parses a "YYYY-MM-DD HH:MM:SS [LEVEL] message" line.
   * @param {string} logLine
   * @returns {?Object} Entry with timestamp, level and message, or null.
   */
  parseStandardLog(logLine) {
    const match = this.patterns.standard.exec(logLine);
    if (!match) return null;
    return {
      format: 'standard',
      timestamp: match[1],
      level: match[2],
      message: match[3]
    };
  }

  /**
   * Parses a syslog line.
   * @param {string} logLine
   * @returns {?Object} Entry with timestamp, host, process, pid (null when
   *   absent) and message, or null when the line does not match.
   */
  parseSyslog(logLine) {
    const match = this.patterns.syslog.exec(logLine);
    if (!match) return null;
    return {
      format: 'syslog',
      timestamp: match[1],
      host: match[2],
      process: match[3],
      pid: match[4] || null,
      message: match[5]
    };
  }

  /**
   * Parses a single-line JSON log record.
   * @param {string} logLine
   * @returns {?Object} The decoded object with a `format: 'json'` field
   *   (a `format` key in the record itself takes precedence), or null when
   *   the text is not valid JSON.
   */
  parseJsonLog(logLine) {
    try {
      const data = JSON.parse(logLine);
      return {
        format: 'json',
        ...data
      };
    } catch (e) {
      // Invalid JSON is an expected outcome; the caller decides how to report it.
      return null;
    }
  }

  /**
   * Converts a timestamp in any supported log format to a Date.
   * @param {string|number|Date} timestamp
   * @returns {?Date} A valid Date, or null when the value cannot be placed on
   *   the timeline (unparseable text, or syslog timestamps, which carry no year).
   */
  parseTimestamp(timestamp) {
    const validOrNull = (date) => (Number.isNaN(date.getTime()) ? null : date);

    if (timestamp instanceof Date) return validOrNull(timestamp);
    if (typeof timestamp === 'number') return validOrNull(new Date(timestamp));
    if (typeof timestamp !== 'string') return null;

    // Apache: 01/Dec/2023:10:32:00 +0000
    const apache = /^(\d{2})\/(\w{3})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) ([+-])(\d{2})(\d{2})$/.exec(timestamp);
    if (apache) {
      const [, day, monthName, year, hour, minute, second, sign, offsetHours, offsetMinutes] = apache;
      const month = this.monthIndex[monthName];
      if (month === undefined) return null;
      const offsetMs = (Number(offsetHours) * 60 + Number(offsetMinutes)) * 60000 * (sign === '-' ? -1 : 1);
      // The logged wall-clock time is UTC + offset, so subtract the offset.
      const utcMs = Date.UTC(Number(year), month, Number(day), Number(hour), Number(minute), Number(second));
      return validOrNull(new Date(utcMs - offsetMs));
    }

    // Standard: "2023-12-01 10:30:45" -> ISO form, interpreted as local time.
    const standard = /^(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2})$/.exec(timestamp);
    if (standard) return validOrNull(new Date(`${standard[1]}T${standard[2]}`));

    // Syslog timestamps have no year; refuse to guess one.
    if (/^\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}$/.test(timestamp)) return null;

    return validOrNull(new Date(timestamp));
  }

  /**
   * Parses every non-blank line of a log text and rebuilds `this.stats`.
   * @param {string} logContent - Full log text; LF and CRLF line endings are accepted.
   * @returns {{entries: Object[], stats: Object}} Parsed entries plus statistics.
   *   `stats.errors` lists lines that threw or could not be parsed, using the
   *   1-based line number in the original text.
   */
  analyzeFile(logContent) {
    const results = [];
    this.stats = this.createEmptyStats();

    // Blank lines are skipped inside the loop (not filtered beforehand) so
    // that `index` keeps pointing at the real line in the input.
    logContent.split(/\r?\n/).forEach((line, index) => {
      if (!line.trim()) return;
      try {
        const parsed = this.parseLine(line);
        if (parsed) {
          results.push(parsed);
          this.updateStats(parsed, index);
        } else {
          this.stats.errors.push({ line: index + 1, error: 'Line could not be parsed' });
        }
      } catch (error) {
        this.stats.errors.push({ line: index + 1, error: error.message });
      }
    });

    this.stats.total = results.length;
    return {
      entries: results,
      stats: this.stats
    };
  }

  /**
   * Adds one entry to the per-level and per-hour counters.
   * @param {Object} entry - Parsed log entry.
   * @param {number} lineIndex - 0-based line index (currently unused).
   */
  updateStats(entry, lineIndex) {
    // Statistics by log level
    if (entry.level) {
      this.stats.byLevel[entry.level] = (this.stats.byLevel[entry.level] || 0) + 1;
    }
    // Statistics by hour: take the HH of the first HH:MM:SS group, which works
    // for every supported timestamp layout. The leading (?:^|\D) keeps the
    // year digits of an Apache timestamp ("2023:10:32:00") from being read as
    // the hour.
    if (typeof entry.timestamp === 'string') {
      const timeMatch = /(?:^|\D)(\d{2}):\d{2}:\d{2}/.exec(entry.timestamp);
      if (timeMatch) {
        const hour = timeMatch[1];
        this.stats.byHour[hour] = (this.stats.byHour[hour] || 0) + 1;
      }
    }
  }

  /**
   * Query functionality: returns the entries matching all given criteria.
   * @param {Object[]} entries - Parsed entries.
   * @param {Object} criteria
   * @param {string} [criteria.level] - Exact level to keep.
   * @param {{start: Date, end: Date}} [criteria.timeRange] - Inclusive range;
   *   entries whose timestamp cannot be parsed are excluded.
   * @param {string|RegExp} [criteria.pattern] - Case-insensitive pattern
   *   tested against the message (or the raw line).
   * @returns {Object[]} Matching entries.
   */
  filter(entries, criteria) {
    // Compile the pattern once instead of once per entry.
    const regex = criteria.pattern ? new RegExp(criteria.pattern, 'i') : null;

    return entries.filter(entry => {
      if (criteria.level && entry.level !== criteria.level) {
        return false;
      }
      if (criteria.timeRange) {
        const entryTime = this.parseTimestamp(entry.timestamp);
        if (!entryTime || entryTime < criteria.timeRange.start || entryTime > criteria.timeRange.end) {
          return false;
        }
      }
      if (regex && !regex.test(entry.message || entry.raw || '')) {
        return false;
      }
      return true;
    });
  }

  /**
   * Generate report from the result of analyzeFile().
   * @param {{entries: Object[], stats: Object}} analysis
   * @returns {Object} Summary, distributions, top errors and recommendations.
   */
  generateReport(analysis) {
    const { entries, stats } = analysis;
    return {
      summary: {
        totalEntries: stats.total,
        errorLines: stats.errors.length,
        timeSpan: this.getTimeSpan(entries),
        formats: this.getFormatDistribution(entries)
      },
      levelDistribution: stats.byLevel,
      hourlyDistribution: stats.byHour,
      topErrors: this.getTopErrors(entries),
      recommendations: this.getRecommendations(stats, entries)
    };
  }

  /**
   * Computes the earliest and latest parseable timestamps.
   * @param {Object[]} entries
   * @returns {?{start: Date, end: Date, duration: number}} Range with the
   *   duration in milliseconds, or null when no timestamp could be parsed.
   */
  getTimeSpan(entries) {
    const timestamps = entries
      .map(e => this.parseTimestamp(e.timestamp))
      .filter(date => date !== null)
      // Numeric comparator: the default sort compares string forms of the dates.
      .sort((a, b) => a - b);
    if (timestamps.length === 0) return null;
    const start = timestamps[0];
    const end = timestamps[timestamps.length - 1];
    return {
      start,
      end,
      duration: end - start
    };
  }

  /**
   * Counts entries per detected format.
   * @param {Object[]} entries
   * @returns {Object} Map of format name to count.
   */
  getFormatDistribution(entries) {
    const distribution = {};
    entries.forEach(entry => {
      distribution[entry.format] = (distribution[entry.format] || 0) + 1;
    });
    return distribution;
  }

  /**
   * Lists the most frequent error messages (level ERROR or HTTP status >= 400).
   * @param {Object[]} entries
   * @param {number} [limit=10] - Maximum number of rows returned.
   * @returns {{error: string, count: number}[]} Sorted by descending count.
   */
  getTopErrors(entries, limit = 10) {
    const errorEntries = entries.filter(e =>
      e.level === 'ERROR' || e.status >= 400
    );
    const errorCounts = {};
    errorEntries.forEach(entry => {
      const key = entry.message || `HTTP ${entry.status}`;
      errorCounts[key] = (errorCounts[key] || 0) + 1;
    });
    return Object.entries(errorCounts)
      .sort(([, a], [, b]) => b - a)
      .slice(0, limit)
      .map(([error, count]) => ({ error, count }));
  }

  /**
   * Derives human-readable recommendations from the statistics.
   * @param {Object} stats - Statistics produced by analyzeFile().
   * @param {Object[]} entries - Parsed entries (currently unused).
   * @returns {string[]} Recommendation messages; empty when nothing stands out.
   */
  getRecommendations(stats, entries) {
    const recommendations = [];
    // Check error rate (guard against dividing by zero on an empty log)
    const errorCount = stats.byLevel['ERROR'] || 0;
    const errorRate = stats.total > 0 ? errorCount / stats.total : 0;
    if (errorRate > 0.1) {
      recommendations.push(`Error rate too high (${(errorRate * 100).toFixed(1)}%), check application status`);
    }
    // Check for abnormal time distribution
    const hours = Object.keys(stats.byHour);
    if (hours.length > 0) {
      const avgPerHour = stats.total / hours.length;
      const maxHour = Object.entries(stats.byHour)
        .sort(([, a], [, b]) => b - a)[0];
      if (maxHour[1] > avgPerHour * 3) {
        recommendations.push(`Abnormal traffic at hour ${maxHour[0]} (${maxHour[1]} entries), check system load during this period`);
      }
    }
    return recommendations;
  }
}
// Usage example: run the analyzer over a small mixed-format log and print
// the resulting report as pretty-printed JSON.
const sampleLog = `2023-12-01 10:30:45 [INFO] Application started
2023-12-01 10:31:00 [ERROR] Database connection failed
2023-12-01 10:31:15 [WARN] Retrying connection
2023-12-01 10:31:30 [INFO] Connection restored
192.168.1.100 - - [01/Dec/2023:10:32:00 +0000] "GET /api/users HTTP/1.1" 200 1234 "-" "Mozilla/5.0"
Dec 1 10:33:15 server01 nginx[1234]: SSL certificate expires soon`;

const analyzer = new LogAnalyzer();
// Parse first, then summarize the parsed entries and statistics.
const analysis = analyzer.analyzeFile(sampleLog);
const report = analyzer.generateReport(analysis);

const reportJson = JSON.stringify(report, null, 2);
console.log('Log Analysis Report:', reportJson);
11.2 Code Format Checker
Code Style Checker
/**
 * Line-based, regex-driven style checker. It inspects each source line in
 * isolation (no parsing), so every rule is a heuristic.
 */
class CodeStyleChecker {
  constructor() {
    this.rules = {
      // JavaScript rules
      javascript: {
        // Missing semicolon: the last non-whitespace character is not ';'
        semicolon: /[^;\s]\s*$/,
        indentation: /^(\s*)/, // Indentation check
        trailingSpaces: /\s+$/, // Trailing spaces
        longLine: /.{81,}/, // Line too long
        console: /console\.(log|error|warn|info)/, // Console statements
        debugger: /\bdebugger\b/, // Debugger statements
        todoComment: /\/\/\s*TODO:/i, // TODO comments
        emptyLine: /^\s*$/, // Empty line
        // Capture ANY identifier; checkNaming() decides whether it is acceptable.
        functionNaming: /\bfunction\s+([A-Za-z_$][\w$]*)/, // Function naming
        variableNaming: /\b(?:let|const|var)\s+([A-Za-z_$][\w$]*)/ // Variable naming
      }
    };
    this.config = {
      maxLineLength: 80,
      indentSize: 2,
      indentType: 'spaces', // 'spaces' or 'tabs'
      allowConsole: false,
      allowDebugger: false,
      // NOTE: despite its name, this flag turns on the lowerCamelCase naming
      // checks. The key is kept as-is for backward compatibility.
      enforceUpperCamelCase: true
    };
  }

  /**
   * Checks a whole source text line by line.
   * @param {string} code - Source text; LF and CRLF line endings are accepted.
   * @param {string} [language='javascript'] - Key into `this.rules`.
   * @param {string} [filename='unknown'] - Name echoed back in the result.
   * @returns {{filename: string, language: string, stats: Object,
   *   issues: Object[], score: number}}
   */
  checkFile(code, language = 'javascript', filename = 'unknown') {
    // Split on CRLF as well, otherwise the leftover '\r' is reported as
    // trailing whitespace on every line.
    const lines = code.split(/\r?\n/);
    const issues = [];
    const stats = {
      lines: lines.length,
      emptyLines: 0,
      codeLines: 0,
      commentLines: 0,
      functions: 0,
      variables: 0
    };
    lines.forEach((line, index) => {
      issues.push(...this.checkLine(line, index + 1, language));
      this.updateStats(line, stats);
    });
    return {
      filename,
      language,
      stats,
      issues,
      score: this.calculateScore(issues, stats)
    };
  }

  /**
   * Removes a trailing `//` comment from a line. Quotes are tracked so that
   * `//` inside a string literal (e.g. a URL) is left alone. Heuristic: regex
   * literals and block comments are not recognized.
   * @param {string} line
   * @returns {string} The line up to (not including) the comment start.
   */
  stripTrailingComment(line) {
    let quote = null;
    for (let i = 0; i < line.length; i++) {
      const ch = line[i];
      if (quote) {
        if (ch === '\\') {
          i++; // Skip the escaped character
        } else if (ch === quote) {
          quote = null;
        }
      } else if (ch === '"' || ch === "'" || ch === '`') {
        quote = ch;
      } else if (ch === '/' && line[i + 1] === '/') {
        return line.slice(0, i);
      }
    }
    return line;
  }

  /**
   * Runs every rule against one line.
   * @param {string} line - Line text without line terminator.
   * @param {number} lineNumber - 1-based line number used in the issues.
   * @param {string} language - Key into `this.rules`.
   * @returns {Object[]} Issues `{ line, column, type, rule, message }`;
   *   empty when the language has no rules.
   */
  checkLine(line, lineNumber, language) {
    const issues = [];
    const rules = this.rules[language];
    if (!rules) return issues;

    // Code part of the line: trailing `//` comment and whitespace removed, so
    // statement-level rules are not confused by comment text.
    const code = this.stripTrailingComment(line).trimEnd();

    // Check line length
    if (line.length > this.config.maxLineLength) {
      issues.push({
        line: lineNumber,
        column: this.config.maxLineLength + 1,
        type: 'warning',
        rule: 'max-line-length',
        message: `Line length exceeds ${this.config.maxLineLength} characters`
      });
    }

    // Check trailing spaces
    if (rules.trailingSpaces.test(line)) {
      issues.push({
        line: lineNumber,
        // Point at the first trailing whitespace character.
        column: line.search(rules.trailingSpaces) + 1,
        type: 'error',
        rule: 'trailing-spaces',
        message: 'Trailing spaces detected'
      });
    }

    // Check indentation (blank lines are ignored)
    const indentMatch = rules.indentation.exec(line);
    if (indentMatch && line.trim()) {
      const indent = indentMatch[1];
      const hasTab = indent.includes('\t');
      if (this.config.indentType === 'spaces' && hasTab) {
        issues.push({
          line: lineNumber,
          column: 1,
          type: 'error',
          rule: 'indent-type',
          message: 'Use spaces for indentation, not tabs'
        });
      } else if (this.config.indentType === 'tabs' && indent.includes(' ')) {
        issues.push({
          line: lineNumber,
          column: 1,
          type: 'error',
          rule: 'indent-type',
          message: 'Use tabs for indentation, not spaces'
        });
      }
      // Check indentation size. Skipped when tabs are present: counting a tab
      // as one space would yield a meaningless second error.
      if (this.config.indentType === 'spaces' && !hasTab &&
          indent.length % this.config.indentSize !== 0) {
        issues.push({
          line: lineNumber,
          column: 1,
          type: 'error',
          rule: 'indent-size',
          message: `Indentation should be a multiple of ${this.config.indentSize}`
        });
      }
    }

    // Check console statements
    if (!this.config.allowConsole && rules.console.test(code)) {
      issues.push({
        line: lineNumber,
        column: code.search(rules.console) + 1,
        type: 'warning',
        rule: 'no-console',
        message: 'Console statements should not be used in production code'
      });
    }

    // Check debugger statements
    if (!this.config.allowDebugger && rules.debugger.test(code)) {
      issues.push({
        line: lineNumber,
        column: code.search(rules.debugger) + 1,
        type: 'error',
        rule: 'no-debugger',
        message: 'Debugger statements should not be used in production code'
      });
    }

    // Check semicolon (JavaScript specific)
    if (language === 'javascript' && this.needsSemicolon(line) && rules.semicolon.test(code)) {
      issues.push({
        line: lineNumber,
        // Column where the semicolon belongs: right after the code part.
        column: code.length + 1,
        type: 'error',
        rule: 'semicolon',
        message: 'Missing semicolon at end of statement'
      });
    }

    // Check naming conventions
    this.checkNaming(line, lineNumber, language, issues);
    return issues;
  }

  /**
   * Decides whether a line looks like a statement that should end with ';'.
   * @param {string} line
   * @returns {boolean} False for blank lines, comments, block openers/closers,
   *   control-flow headers, continuation lines and case labels.
   */
  needsSemicolon(line) {
    const trimmed = this.stripTrailingComment(line).trim();
    if (!trimmed) return false;
    // These statements don't need semicolons
    const noSemicolonPatterns = [
      /^\s*\/\//, // Comment
      /^\s*\/\*/, // Block comment start
      /^\s*\*/, // Block comment middle
      /^\s*\*\//, // Block comment end
      /\{$/, // Ends with brace
      /\}$/, // Ends with brace
      /[,(\[]$/, // Continuation of a multi-line expression or literal
      /^(?:case\b.*|default\s*):$/, // switch labels
      /^\s*if\s*\(/, // if statement
      /^\s*else/, // else statement
      /^\s*for\s*\(/, // for loop
      /^\s*while\s*\(/, // while loop
      /^\s*switch\s*\(/, // switch statement
      /^\s*function/, // Function definition
      /^\s*class/, // Class definition
    ];
    return !noSemicolonPatterns.some(pattern => pattern.test(trimmed));
  }

  /**
   * Appends naming-convention warnings for the first function and the first
   * variable declared on the line.
   * @param {string} line
   * @param {number} lineNumber - 1-based line number used in the issues.
   * @param {string} language - Key into `this.rules`.
   * @param {Object[]} issues - Output array; warnings are pushed onto it.
   */
  checkNaming(line, lineNumber, language, issues) {
    const rules = this.rules[language];
    if (!rules || !this.config.enforceUpperCamelCase) return;

    const lowerCamelCase = /^[a-z][a-zA-Z0-9]*$/;
    // Ignore declarations that only appear inside a trailing comment.
    const code = this.stripTrailingComment(line);
    const checks = [
      { pattern: rules.functionNaming, rule: 'function-naming', message: 'Function name should use lowerCamelCase' },
      { pattern: rules.variableNaming, rule: 'variable-naming', message: 'Variable name should use lowerCamelCase' }
    ];

    checks.forEach(({ pattern, rule, message }) => {
      const match = pattern.exec(code);
      if (!match) return;
      const name = match[1];
      if (!lowerCamelCase.test(name)) {
        issues.push({
          line: lineNumber,
          // The identifier is the tail of the match, so derive its column
          // from the match position instead of searching the line again.
          column: match.index + match[0].length - name.length + 1,
          type: 'warning',
          rule,
          message
        });
      }
    });
  }

  /**
   * Updates the running line/function/variable counters for one line.
   * @param {string} line
   * @param {Object} stats - Counter object created by checkFile(); mutated in place.
   */
  updateStats(line, stats) {
    const trimmed = line.trim();
    if (trimmed === '') {
      stats.emptyLines++;
    } else if (trimmed.startsWith('//') || trimmed.startsWith('/*')) {
      stats.commentLines++;
    } else {
      stats.codeLines++;
    }
    // Count functions and variables
    if (/\bfunction\s+\w+/.test(line)) {
      stats.functions++;
    }
    if (/\b(?:let|const|var)\s+\w+/.test(line)) {
      stats.variables++;
    }
  }

  /**
   * Converts the issue list into a 0-100 quality score.
   * @param {Object[]} issues - Issues with a `type` of 'error' or 'warning'.
   * @param {Object} stats - Counters from checkFile().
   * @returns {number} Rounded score; 100 when the file has no non-empty lines.
   */
  calculateScore(issues, stats) {
    const errorCount = issues.filter(i => i.type === 'error').length;
    const warningCount = issues.filter(i => i.type === 'warning').length;
    const totalLines = stats.codeLines + stats.commentLines; // Excluding empty lines
    if (totalLines === 0) return 100;
    const errorPenalty = errorCount * 5;
    const warningPenalty = warningCount * 2;
    const totalPenalty = errorPenalty + warningPenalty;
    const score = Math.max(0, 100 - (totalPenalty / totalLines) * 100);
    return Math.round(score);
  }

  /**
   * Aggregates several checkFile() results into one report.
   * @param {Object[]} results - Results returned by checkFile(); not modified.
   * @returns {Object} Summary, files sorted by ascending score, recommendations.
   *   With no results the average score is reported as 100 (nothing to penalize).
   */
  generateReport(results) {
    const totalIssues = results.reduce((sum, r) => sum + r.issues.length, 0);
    const avgScore = results.length > 0
      ? results.reduce((sum, r) => sum + r.score, 0) / results.length
      : 100;
    const issuesByType = {};
    const issuesByRule = {};
    results.forEach(result => {
      result.issues.forEach(issue => {
        issuesByType[issue.type] = (issuesByType[issue.type] || 0) + 1;
        issuesByRule[issue.rule] = (issuesByRule[issue.rule] || 0) + 1;
      });
    });
    return {
      summary: {
        filesChecked: results.length,
        totalIssues,
        averageScore: Math.round(avgScore),
        issuesByType,
        issuesByRule: Object.entries(issuesByRule)
          .sort(([, a], [, b]) => b - a)
          .slice(0, 10) // Top 10 most common issues
      },
      // Sort a copy so the caller's array keeps its order.
      files: [...results].sort((a, b) => a.score - b.score), // Sort by score
      recommendations: this.getStyleRecommendations(issuesByRule, totalIssues)
    };
  }

  /**
   * Produces advice for the three most frequent rules.
   * @param {Object} issuesByRule - Map of rule name to issue count.
   * @param {number} totalIssues - Total number of issues across all files.
   * @returns {string[]} Recommendation messages.
   */
  getStyleRecommendations(issuesByRule, totalIssues) {
    const recommendations = [];
    Object.entries(issuesByRule)
      .sort(([, a], [, b]) => b - a)
      .slice(0, 3)
      .forEach(([rule, count]) => {
        const percentage = (count / totalIssues * 100).toFixed(1);
        switch (rule) {
          case 'trailing-spaces':
            recommendations.push(`${percentage}% of issues are trailing spaces, configure editor to remove automatically`);
            break;
          case 'semicolon':
            recommendations.push(`${percentage}% of issues are missing semicolons, use ESLint auto-fix`);
            break;
          case 'no-console':
            recommendations.push(`${percentage}% of issues are console statements, replace with logging library`);
            break;
          case 'max-line-length':
            recommendations.push(`${percentage}% of issues are lines too long, refactor complex expressions`);
            break;
          default:
            recommendations.push(`Most common issue is ${rule} (${percentage}%)`);
        }
      });
    return recommendations;
  }
}
// Usage example: check a deliberately sloppy snippet and print the
// aggregated report as pretty-printed JSON.
const sampleCode = `function helloWorld(){
console.log("Hello World") // Missing semicolon, has trailing spaces
let x=42; // Indentation error
debugger; // Should not have debugger
}
let VeryLongVariableName = "This is a very long line that exceeds the maximum line length limit and should be broken into multiple lines";
`;

const checker = new CodeStyleChecker();
// generateReport() expects a list of per-file results, hence the array.
const result = checker.checkFile(sampleCode, 'javascript', 'sample.js');
const report = checker.generateReport([result]);

const reportJson = JSON.stringify(report, null, 2);
console.log('Code Check Report:', reportJson);
Summary
This chapter demonstrated how regular expressions can be applied to practical text processing through three comprehensive projects:
- Log Analysis Tool:
  - Multi-format log parsing
  - Automatic format detection
  - Statistical analysis and report generation
  - Query and filter functionality
- Code Format Checker:
  - Multi-rule code checking
  - Configurable check options
  - Detailed issue reporting
  - Code quality scoring
- Text Content Extraction Tool:
  - Multi-format text processing
  - Intelligent content recognition
  - Structured data extraction
  - Statistical analysis functionality
These tools show the practical value of regular expressions in real projects, where they provide efficient solutions to complex text processing tasks.