Chapter 12: Advanced Knowledge and Applications
Learning Objectives
- Understand Unicode and multi-language text processing
- Master recursive regular expression concepts
- Learn regular expression replacement and callbacks
- Understand finite state automaton principles
- Explore limitations and alternatives to regular expressions
12.1 Unicode and Multi-language Text Processing
Unicode Basics
// Sample strings illustrating different kinds of Unicode text.
const examples = {
  // Basic Chinese characters (CJK ideographs, one UTF-16 code unit each)
  chinese: '你好世界',
  // Accented Latin characters
  accented: 'café naïve résumé',
  // Emoji: each of these is one code point but two UTF-16 code units
  emoji: '😀🎉🌟💖',
  // Composite character: the family emoji is four emoji joined by
  // U+200D ZERO WIDTH JOINER. The joiners are written as escapes so the
  // composition is visible and survives copy/paste.
  composite: '👨\u200D👩\u200D👧\u200D👦',
  // Mixed languages
  mixed: 'Hello 你好 Bonjour مرحبا'
};
// View Unicode information of characters
/**
 * Describe every code point of a string.
 *
 * The string is walked by code point (not by UTF-16 code unit), so a
 * character outside the BMP yields a single entry whose `length` is 2.
 *
 * @param {string} text - Text to inspect.
 * @returns {Array<{char: string, codePoint: number, hexCode: string, category: string, length: number}>}
 *   One record per code point: the character, its numeric code point, the
 *   upper-case hex form padded to at least four digits, the label returned by
 *   getUnicodeCategory, and the number of UTF-16 code units it occupies.
 */
function analyzeUnicode(text) {
  const describeSymbol = (symbol) => {
    const scalar = symbol.codePointAt(0);
    return {
      char: symbol,
      codePoint: scalar,
      hexCode: scalar.toString(16).toUpperCase().padStart(4, '0'),
      category: getUnicodeCategory(symbol),
      length: symbol.length
    };
  };
  // Array.from uses the string iterator, which keeps surrogate pairs together.
  return Array.from(text, describeSymbol);
}
/**
 * Classify a character by the code point range its first code point falls in.
 *
 * Only a handful of ranges are recognised (ASCII digits and letters, the
 * U+4E00–U+9FFF CJK block and the U+1F600–U+1F64F emoticons block);
 * everything else is reported as 'Other'.
 *
 * @param {string} char - String whose first code point is classified.
 * @returns {string} The category label.
 */
function getUnicodeCategory(char) {
  const point = char.codePointAt(0);
  // [first, last, label] — inclusive bounds, checked in order.
  const knownRanges = [
    [0x0030, 0x0039, 'Digit'],
    [0x0041, 0x005A, 'Uppercase Letter'],
    [0x0061, 0x007A, 'Lowercase Letter'],
    [0x4E00, 0x9FFF, 'CJK Ideograph'],
    [0x1F600, 0x1F64F, 'Emoticons']
  ];
  for (const [first, last, label] of knownRanges) {
    if (point >= first && point <= last) {
      return label;
    }
  }
  return 'Other';
}
// Demo: print the per-code-point breakdown of a short mixed-script sample.
console.log(
  'Unicode Analysis:',
  analyzeUnicode('Hello 你好 😀')
);
Unicode Property Matching
// Unicode property patterns (ES2018+)
/**
 * Helpers for analysing, extracting and normalising multilingual text with
 * Unicode property escapes (`\p{...}` under the `u` flag, ES2018+).
 */
class UnicodeTextProcessor {
  constructor() {
    // Single-character testers. None of them is global, so they carry no
    // lastIndex state and can be reused freely with .test().
    this.patterns = {
      // Match any letter
      letters: /\p{Letter}/u,
      // Match any numeric character (general category Number)
      numbers: /\p{Number}/u,
      // Match Chinese characters
      chinese: /\p{Script=Han}/u,
      // Match pictographic emoji. \p{Emoji} is deliberately avoided: it also
      // matches the ASCII digits, '#' and '*'.
      emoji: /\p{Extended_Pictographic}/u,
      // Match Arabic text
      arabic: /\p{Script=Arabic}/u,
      // Match punctuation
      punctuation: /\p{Punctuation}/u,
      // Match currency symbols
      currency: /\p{Currency_Symbol}/u,
      // Match math symbols
      math: /\p{Math_Symbol}/u,
      // Match whitespace
      whitespace: /\p{White_Space}/u,
      // Match uppercase letters
      uppercase: /\p{Uppercase_Letter}/u,
      // Match lowercase letters
      lowercase: /\p{Lowercase_Letter}/u
    };
  }

  /**
   * Count how many code points of `text` belong to each known script.
   *
   * @param {string} text - Text to analyse.
   * @returns {Object<string, number>} Script name -> number of matching code
   *   points. Scripts that do not occur are omitted.
   */
  analyzeLanguages(text) {
    const patterns = {
      latin: /\p{Script=Latin}/gu,
      han: /\p{Script=Han}/gu,
      arabic: /\p{Script=Arabic}/gu,
      cyrillic: /\p{Script=Cyrillic}/gu,
      greek: /\p{Script=Greek}/gu,
      hebrew: /\p{Script=Hebrew}/gu,
      japanese: /[\p{Script=Hiragana}\p{Script=Katakana}]/gu,
      korean: /\p{Script=Hangul}/gu,
      thai: /\p{Script=Thai}/gu,
      devanagari: /\p{Script=Devanagari}/gu
    };
    const scripts = {};
    for (const [script, pattern] of Object.entries(patterns)) {
      const matches = text.match(pattern);
      if (matches) {
        scripts[script] = matches.length;
      }
    }
    return scripts;
  }

  /**
   * Extract the runs of `text` written in one script.
   *
   * For the space-separated scripts (english, arabic, russian) a run may span
   * several words, but it always starts and ends on a letter, so stray
   * whitespace between foreign words is never returned as a match.
   *
   * @param {string} text - Text to search.
   * @param {string} script - One of 'chinese', 'english', 'arabic',
   *   'russian', 'japanese', 'korean' (case-insensitive).
   * @returns {string[]} The matching runs, or [] for an unknown script.
   */
  extractByScript(text, script) {
    // A Map (rather than an object literal) so that names such as
    // 'constructor' cannot resolve to inherited properties.
    const patterns = new Map([
      ['chinese', /\p{Script=Han}+/gu],
      ['english', /\p{Script=Latin}+(?:\s+\p{Script=Latin}+)*/gu],
      ['arabic', /\p{Script=Arabic}+(?:\s+\p{Script=Arabic}+)*/gu],
      ['russian', /\p{Script=Cyrillic}+(?:\s+\p{Script=Cyrillic}+)*/gu],
      ['japanese', /[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}]+/gu],
      ['korean', /\p{Script=Hangul}+/gu]
    ]);
    const pattern = patterns.get(String(script).toLowerCase());
    if (!pattern) return [];
    return text.match(pattern) || [];
  }

  /**
   * Clean and normalize text.
   *
   * @param {string} text - Text to normalize.
   * @param {Object} [options]
   * @param {boolean} [options.removeEmoji=false] - Strip emoji.
   * @param {boolean} [options.removeAccents=false] - Strip combining accents
   *   (U+0300–U+036F) after canonical decomposition.
   * @param {boolean} [options.lowercaseOnly=false] - Lower-case the result.
   * @param {boolean} [options.removeExtraSpaces=true] - Collapse whitespace
   *   runs to one space and trim.
   * @param {string} [options.normalizeForm='NFC'] - NFC, NFD, NFKC or NFKD.
   * @returns {string} The normalized text, in `normalizeForm`.
   */
  normalizeText(text, options = {}) {
    const {
      removeEmoji = false,
      removeAccents = false,
      lowercaseOnly = false,
      removeExtraSpaces = true,
      normalizeForm = 'NFC' // NFC, NFD, NFKC, NFKD
    } = options;

    // Unicode normalization
    let normalized = text.normalize(normalizeForm);

    if (removeEmoji) {
      // Whole emoji sequences are removed so no joiners, variation selectors
      // or skin-tone modifiers are left behind:
      //   - regional indicators (flag halves)
      //   - keycaps: digit/#/* + optional VS16 + U+20E3
      //   - pictographs, optionally modified, optionally chained with ZWJ
      //   - any leftover stand-alone skin-tone modifier
      // Plain digits, '#' and '*' are untouched (unlike with \p{Emoji}).
      const emojiSequence = /\p{Regional_Indicator}|[0-9#*]\uFE0F?\u20E3|\p{Extended_Pictographic}(?:\uFE0F|\p{Emoji_Modifier})?(?:\u200D\p{Extended_Pictographic}(?:\uFE0F|\p{Emoji_Modifier})?)*|\p{Emoji_Modifier}/gu;
      normalized = normalized.replace(emojiSequence, '');
    }

    if (removeAccents) {
      // Decompose, drop the combining marks, then return to the requested
      // form. Without the final step the text would stay in NFD, which e.g.
      // leaves Hangul syllables split into individual jamo.
      normalized = normalized
        .normalize('NFD')
        .replace(/[\u0300-\u036f]/g, '')
        .normalize(normalizeForm);
    }

    // Convert to lowercase
    if (lowercaseOnly) {
      normalized = normalized.toLowerCase();
    }

    // Clean extra spaces
    if (removeExtraSpaces) {
      normalized = normalized.replace(/\s+/g, ' ').trim();
    }

    return normalized;
  }
}
// Usage example: analyse, extract from and normalize multilingual samples.
const processor = new UnicodeTextProcessor();
const multilingualText = 'Hello 你好 مرحبا Здравствуйте こんにちは 안녕하세요 🌍';

console.log('Language analysis:', processor.analyzeLanguages(multilingualText));

// Same extraction call for each script of interest, logged in order.
for (const [label, script] of [
  ['Chinese extraction:', 'chinese'],
  ['English extraction:', 'english']
]) {
  console.log(label, processor.extractByScript(multilingualText, script));
}

const normalizedText = processor.normalizeText('Café naïve résumé 🎉', {
  removeAccents: true,
  removeEmoji: true,
  lowercaseOnly: true
});
console.log('Normalized result:', normalizedText);
12.2 Recursive Regular Expressions
Although JavaScript doesn’t natively support recursive regular expressions, we can understand the concept and implement similar functionality in other ways.
Recursive Matching Concept
// Recursive regex concept (PCRE syntax, not supported in JavaScript)
// Match balanced parentheses: \((?:[^()]|(?R))*\)
// Match nested HTML tags: <(\w+)[^<>]*>(?:[^<>]|(?R))*</\1>
// Alternative implementation in JavaScript
/**
 * Hand-written scanners for nested structures that JavaScript regular
 * expressions cannot match on their own (there is no recursion construct).
 */
class RecursivePatternMatcher {
  /**
   * Find every outermost balanced parenthesis group.
   *
   * @param {string} text - Text to scan.
   * @returns {Array<{match: string, start: number, end: number}>} Outermost
   *   groups in order of appearance; `end` is exclusive.
   */
  matchBalancedParentheses(text) {
    const results = [];
    let depth = 0;
    let start = -1;
    for (let i = 0; i < text.length; i++) {
      const char = text[i];
      if (char === '(') {
        if (depth === 0) {
          start = i;
        }
        depth++;
      } else if (char === ')') {
        depth--;
        if (depth === 0 && start !== -1) {
          results.push({
            match: text.substring(start, i + 1),
            start: start,
            end: i + 1
          });
          start = -1;
        } else if (depth < 0) {
          depth = 0; // Stray ')': ignore it so later groups still match
        }
      }
    }
    return results;
  }

  /**
   * Find every top-level JSON object embedded in free text.
   *
   * Braces and brackets inside string literals are ignored, and each
   * candidate is validated with JSON.parse before being reported. Objects
   * that sit inside a `[...]` are not reported on their own.
   *
   * @param {string} text - Text to scan.
   * @returns {Array<{match: string, start: number, end: number}>} Valid
   *   objects in order of appearance; `end` is exclusive.
   */
  matchNestedJson(text) {
    const results = [];
    let braceDepth = 0;
    let bracketDepth = 0;
    let start = -1;
    let inString = false;
    let escaped = false;
    for (let i = 0; i < text.length; i++) {
      const char = text[i];

      if (inString) {
        if (escaped) {
          escaped = false; // This character was escaped; take it literally
        } else if (char === '\\') {
          escaped = true;
        } else if (char === '"') {
          inString = false;
        }
        continue;
      }

      if (char === '"') {
        inString = true;
      } else if (char === '{') {
        if (braceDepth === 0 && bracketDepth === 0) {
          start = i;
        }
        braceDepth++;
      } else if (char === '}') {
        // A stray '}' in the surrounding prose must not drive the depth
        // negative, otherwise no later object could ever be found.
        if (braceDepth === 0) continue;
        braceDepth--;
        if (braceDepth === 0 && bracketDepth === 0 && start !== -1) {
          const jsonStr = text.substring(start, i + 1);
          try {
            JSON.parse(jsonStr); // Validate JSON
            results.push({
              match: jsonStr,
              start: start,
              end: i + 1
            });
          } catch (e) {
            // Balanced braces but not valid JSON: deliberately skipped
          }
          start = -1;
        }
      } else if (char === '[') {
        bracketDepth++;
      } else if (char === ']') {
        // Same clamping as for '}' above
        if (bracketDepth > 0) bracketDepth--;
      }
    }
    return results;
  }

  /**
   * Find matching open/close tag pairs, including nested ones.
   *
   * A closing tag is paired with the nearest open tag of the same name; any
   * tags opened after that one and still unclosed are discarded with it.
   * Tags written as `<name ... />` are treated as self-closing.
   *
   * @param {string} text - Markup to scan.
   * @param {?string} [tagName=null] - Restrict matching to this tag name
   *   (case-insensitive); null matches every tag.
   * @returns {Array<{tag: string, fullMatch: string, content: string, start: number, end: number, depth: number}>}
   *   Pairs in the order their closing tags appear. `depth` is the number of
   *   tags still open around the pair (0 for an outermost element).
   */
  matchNestedTags(text, tagName = null) {
    const results = [];
    const stack = [];
    const tagPattern = /<(\/?)([\w-]+)(?:\s[^>]*)?>/g;
    const wantedTag = tagName ? tagName.toLowerCase() : null;

    for (const match of text.matchAll(tagPattern)) {
      const isClosing = match[1] === '/';
      const currentTag = match[2].toLowerCase();
      const fullMatch = match[0];
      const position = match.index;

      if (wantedTag && currentTag !== wantedTag) {
        continue;
      }

      if (isClosing) {
        // Find the nearest matching opening tag
        for (let i = stack.length - 1; i >= 0; i--) {
          if (stack[i].tag === currentTag) {
            const startTag = stack[i];
            const end = position + fullMatch.length;
            results.push({
              tag: currentTag,
              fullMatch: text.substring(startTag.start, end),
              content: text.substring(startTag.end, position),
              start: startTag.start,
              end: end,
              // The opener's own stack index, so unclosed tags sitting above
              // it (e.g. a bare <br>) do not inflate the depth.
              depth: i
            });
            // Drop the matched tag and everything opened after it
            stack.splice(i);
            break;
          }
        }
      } else if (!fullMatch.endsWith('/>')) {
        // Opening tag that is not self-closing
        stack.push({
          tag: currentTag,
          start: position,
          end: position + fullMatch.length
        });
      }
    }
    return results;
  }
}
// Usage example: run each scanner against a small sample.
const matcher = new RecursivePatternMatcher();

// Sample inputs: nested calls, embedded JSON, nested markup
const parenthesesText = 'Function call: func(a, func2(b, c), d) and another(x, y)';
const jsonText = 'Here is JSON: {"name": "test", "nested": {"value": 123}} and another {"simple": true}';
const htmlText = '<div>Outer <p>Inner content</p> More content <span>Another inner</span></div>';

console.log(
  'Parentheses match:',
  matcher.matchBalancedParentheses(parenthesesText)
);
console.log(
  'JSON match:',
  matcher.matchNestedJson(jsonText)
);
console.log(
  'HTML match:',
  matcher.matchNestedTags(htmlText)
);
12.3 Advanced Replacement Techniques
/**
 * Replacement helpers built on String.prototype.replace with callbacks.
 */
class AdvancedReplacer {
  constructor() {
    this.templates = new Map();
    this.transformers = new Map();
  }

  /**
   * Replace each match with `template`, expanding `$0` (whole match) and
   * `$1`, `$2`, ... (capture groups).
   *
   * @param {string} text - Input text.
   * @param {RegExp|string} pattern - What to replace.
   * @param {string} template - Replacement template with `$n` placeholders.
   * @returns {string} Text with matches replaced. Placeholders for groups
   *   that do not exist or did not participate expand to ''.
   */
  replaceWithTemplate(text, pattern, template) {
    return text.replace(pattern, (match, ...rest) => {
      // After the captures, replace() passes the numeric match offset, the
      // whole input and possibly a named-groups object. Cut the list at the
      // offset so `$n` beyond the last group cannot leak those values.
      const offsetIndex = rest.findIndex((arg) => typeof arg === 'number');
      const captures = offsetIndex === -1 ? rest : rest.slice(0, offsetIndex);
      // Replace $0, $1, $2 placeholders
      return template.replace(/\$(\d+)/g, (_, index) => {
        const groupIndex = Number.parseInt(index, 10);
        if (groupIndex === 0) return match;
        return captures[groupIndex - 1] || '';
      });
    });
  }

  /**
   * Apply a list of replacement rules in order.
   *
   * @param {string} text - Input text.
   * @param {Array<Object>} rules - Each rule has:
   *   `pattern` (RegExp|string);
   *   `condition` (boolean, or function `(match, groups) => boolean`);
   *   `replacement` (string, or function `(match, groups) => string`) used
   *   when the condition holds;
   *   `otherwise` (optional string) used when it does not — when omitted the
   *   match is left unchanged.
   *   `groups` is everything replace() passes after the match (captures
   *   first, then offset and input).
   * @returns {string} Text after all rules were applied.
   */
  conditionalReplace(text, rules) {
    let result = text;
    rules.forEach(rule => {
      const { pattern, condition, replacement, otherwise } = rule;
      result = result.replace(pattern, (match, ...groups) => {
        const shouldReplace = typeof condition === 'function'
          ? condition(match, groups)
          : condition;
        if (shouldReplace) {
          return typeof replacement === 'function'
            ? replacement(match, groups)
            : replacement;
        }
        // `??` rather than `||` so that `otherwise: ''` (delete the match)
        // is honoured instead of being treated as "not provided".
        return otherwise ?? match;
      });
    });
    return result;
  }

  /**
   * Replace matches via a callback that also receives a running index.
   *
   * @param {string} text - Input text.
   * @param {RegExp|string} pattern - What to replace.
   * @param {function(string, Array, number): string} callback - Called as
   *   `(match, groups, index)` with a zero-based match index.
   * @returns {{result: string, count: number}} The new text and the number
   *   of replacements made.
   */
  replaceWithCounter(text, pattern, callback) {
    let count = 0;
    const result = text.replace(pattern, (match, ...groups) => {
      return callback(match, groups, count++);
    });
    return { result, count };
  }

  /**
   * Run the formatter registered for each key of `formatters`.
   *
   * Recognised keys are 'phone', 'date', 'currency', 'url' and 'email'.
   * Only the phone formatter is defined on this class; a key whose formatter
   * method is not present on the instance is skipped, just like an
   * unrecognised key, instead of throwing a TypeError.
   *
   * @param {string} text - Input text.
   * @param {Object<string, Object>} formatters - Type -> formatter options.
   * @returns {string} The formatted text.
   */
  smartFormat(text, formatters) {
    const handlerNames = new Map([
      ['phone', 'formatPhoneNumbers'],
      ['date', 'formatDates'],
      ['currency', 'formatCurrency'],
      ['url', 'formatUrls'],
      ['email', 'formatEmails']
    ]);
    let result = text;
    for (const [type, formatter] of Object.entries(formatters)) {
      const methodName = handlerNames.get(type);
      if (methodName && typeof this[methodName] === 'function') {
        result = this[methodName](result, formatter);
      }
    }
    return result;
  }

  /**
   * Reformat 10-digit phone numbers (3-3-4, optionally separated by '-',
   * '.' or whitespace).
   *
   * @param {string} text - Input text.
   * @param {{format: string}} formatter - `format` is one of 'dots',
   *   'dashes', 'parentheses' or 'international'; any other value leaves the
   *   numbers unchanged.
   * @returns {string} Text with phone numbers reformatted.
   */
  formatPhoneNumbers(text, formatter) {
    const phonePattern = /\b(\d{3})[-.\s]?(\d{3})[-.\s]?(\d{4})\b/g;
    const format = formatter?.format;
    return text.replace(phonePattern, (match, area, exchange, number) => {
      switch (format) {
        case 'dots':
          return `${area}.${exchange}.${number}`;
        case 'dashes':
          return `${area}-${exchange}-${number}`;
        case 'parentheses':
          return `(${area}) ${exchange}-${number}`;
        case 'international':
          return `+1-${area}-${exchange}-${number}`;
        default:
          return match;
      }
    });
  }
}
// Usage example
const replacer = new AdvancedReplacer();

// Template replacement example
const templateText = "User John's email is john@example.com";
const templateResult = replacer.replaceWithTemplate(
  templateText,
  /User (\w+)'s email is ([^\s]+)/,
  "User: $1, Email: $2"
);
console.log('Template replacement:', templateResult);

// Conditional replacement example
const conditionalText = "Price: $10, $25, $100, $5";
const conditionalResult = replacer.conditionalReplace(conditionalText, [
  {
    pattern: /\$(\d+)/g,
    condition: (match, groups) => Number.parseInt(groups[0], 10) > 20,
    replacement: (match, groups) => `💰${groups[0]}`
    // No `otherwise` here: conditionalReplace keeps the original match when
    // the condition fails. (`match` only exists inside the callbacks, so
    // writing `otherwise: match` at this level throws a ReferenceError.)
  }
]);
console.log('Conditional replacement:', conditionalResult);
12.4 Limitations of Regular Expressions
Problems Regular Expressions Cannot Solve
/**
 * Small demonstrations of what regular expressions cannot do.
 */
class RegexLimitations {
  // Limitation 1: nested structures (balanced parentheses, nested HTML tags)
  // cannot be matched by native JavaScript regular expressions.

  /**
   * Limitation 2: no arithmetic. A pattern can check that an equation is
   * well-formed, but not that it is true.
   *
   * @returns {{formatValid: boolean, note: string}} Whether the sample
   *   equation has a valid shape, plus an explanatory note.
   */
  cannotDoMath() {
    const sampleEquation = "2 + 3 * 4 = 14";
    // Shape only: operand (operator operand)+ '=' result
    const shapeOnly = /^\d+\s*[+\-*/]\s*\d+(?:\s*[+\-*/]\s*\d+)*\s*=\s*\d+$/;
    const formatValid = shapeOnly.test(sampleEquation);
    return {
      formatValid,
      note: "Regular expressions cannot verify mathematical calculation correctness"
    };
  }

  /**
   * Limitation 3: no context sensitivity. Declarations and usages can each
   * be listed, but a pattern cannot relate one to the other.
   *
   * @returns {{declarations: string[], usages: string[], note: string}}
   *   Names declared with `let`, names passed to `console.log`, and a note.
   */
  cannotHandleContextSensitive() {
    // Sample program: `x` is declared before use, `y` never is.
    const code = [
      '',
      'let x = 10;',
      'console.log(x); // Using declared variable',
      'console.log(y); // Using undeclared variable',
      ''
    ].join('\n');
    const firstGroups = (regex) => Array.from(code.matchAll(regex), (hit) => hit[1]);
    const declarations = firstGroups(/let\s+(\w+)/g);
    const usages = firstGroups(/console\.log\((\w+)\)/g);
    return {
      declarations,
      usages,
      note: "Regular expressions cannot check variable scope and declaration status"
    };
  }

  /**
   * List better-suited tools for each class of problem.
   *
   * @returns {Object<string, {problem: string, solution: string, examples: string[]}>}
   *   One entry per problem area.
   */
  getAlternativeSolutions() {
    const describe = (problem, solution, examples) => ({ problem, solution, examples });
    return {
      parsing: describe(
        "Complex syntax parsing",
        "Use specialized parser generators",
        ["ANTLR", "PEG.js", "Parser combinators"]
      ),
      validation: describe(
        "Complex data validation",
        "Use validation libraries and rule engines",
        ["Joi", "Yup", "JSON Schema"]
      ),
      textProcessing: describe(
        "Advanced text processing",
        "Use specialized text processing libraries",
        ["Natural Language Toolkit", "spaCy", "Stanford NLP"]
      ),
      codeAnalysis: describe(
        "Code analysis and refactoring",
        "Use Abstract Syntax Tree (AST) tools",
        ["Babel", "ESLint", "TypeScript Compiler API"]
      )
    };
  }
}
// Usage example: print each demonstration in turn.
const limitations = new RegexLimitations();
for (const [label, method] of [
  ['Math expression limitation:', 'cannotDoMath'],
  ['Context-sensitive limitation:', 'cannotHandleContextSensitive'],
  ['Alternative solutions:', 'getAlternativeSolutions']
]) {
  console.log(label, limitations[method]());
}
When Not to Use Regular Expressions
/**
 * Side-by-side examples of tasks where a regular expression is the wrong
 * tool, each paired with a better approach.
 */
class WhenNotToUseRegex {
  /**
   * Case 1: parsing complex structured data.
   *
   * @returns {{badJsonParser: Function, goodJsonParser: Function}}
   *   `badJsonParser` is an intentionally unfinished regex-based sketch that
   *   returns nothing; `goodJsonParser` returns the parsed value, or null
   *   when the input is not valid JSON.
   */
  parseComplexData() {
    // ❌ Bad: piecing a JSON parser together from token regexes
    const badJsonParser = (jsonStr) => {
      const stringPattern = /"([^"\\]|\\.)*"/g;
      const numberPattern = /-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/g;
      // ... and it only gets more complex and less reliable from here
    };

    // ✅ Good: hand the job to the built-in parser
    const goodJsonParser = (jsonStr) => {
      let parsed = null;
      try {
        parsed = JSON.parse(jsonStr);
      } catch (error) {
        // Invalid input is reported to the caller as null
        parsed = null;
      }
      return parsed;
    };

    return { badJsonParser, goodJsonParser };
  }

  /**
   * Case 2: complex business-rule validation.
   *
   * @returns {{badPasswordValidator: Function, goodPasswordValidator: Function}}
   *   `badPasswordValidator(password)` returns a bare boolean from a single
   *   regex; `goodPasswordValidator(password)` returns
   *   `{valid, errors}` where `errors` lists the message of every failed
   *   rule, in rule order.
   */
  validateBusinessLogic() {
    // ❌ Bad: every rule crammed into one lookahead-heavy pattern
    const badPasswordValidator = (password) =>
      /^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/.test(password);

    // ✅ Good: one small check per rule, each with its own message
    const goodPasswordValidator = (password) => {
      const checks = [
        ["At least 8 characters", (p) => p.length >= 8],
        ["Contains lowercase letter", (p) => /[a-z]/.test(p)],
        ["Contains uppercase letter", (p) => /[A-Z]/.test(p)],
        ["Contains digit", (p) => /\d/.test(p)],
        ["Contains special character", (p) => /[@$!%*?&]/.test(p)],
        ["No consecutive repeated characters", (p) => !/(.)\1{2,}/.test(p)]
      ];
      const errors = [];
      for (const [message, passes] of checks) {
        if (!passes(password)) {
          errors.push(message);
        }
      }
      return { valid: errors.length === 0, errors };
    };

    return { badPasswordValidator, goodPasswordValidator };
  }

  /**
   * Recommended alternatives, grouped by problem area.
   *
   * @returns {Object<string, string[]>} Area name -> list of suggestions.
   */
  getRecommendedAlternatives() {
    const structuredDataParsing = [
      "JSON.parse() for JSON",
      "DOMParser for HTML/XML",
      "CSV parsing libraries",
      "YAML/TOML parsers"
    ];
    const complexValidation = [
      "Joi validation library",
      "Yup schema validation",
      "Custom validation functions",
      "Rule-based validation engines"
    ];
    const codeAnalysis = [
      "AST parsers (Babel, TypeScript)",
      "ESLint rules",
      "Language servers",
      "Static analysis tools"
    ];
    const textProcessing = [
      "String methods for simple cases",
      "Specialized NLP libraries",
      "Tokenization libraries",
      "Search engines (Elasticsearch, Solr)"
    ];
    const performanceCritical = [
      "Native string methods",
      "Streaming processors",
      "Compiled regular expressions",
      "Finite state machines"
    ];
    return {
      structuredDataParsing,
      complexValidation,
      codeAnalysis,
      textProcessing,
      performanceCritical
    };
  }
}
// Named `regexAlternatives` rather than `examples`: the chapter's Unicode
// section already declares a top-level `const examples`, and a second
// declaration of the same name in one file is a SyntaxError.
const regexAlternatives = new WhenNotToUseRegex();
console.log('Alternatives to regular expressions:', regexAlternatives.getRecommendedAlternatives());
Summary
This chapter explored advanced topics and extended knowledge of regular expressions:
1. Unicode and Multi-language Processing:
   - Unicode property matching
   - Multi-language text analysis
   - Text normalization and tokenization
   - International text processing
2. Recursive Regular Expressions:
   - Recursive matching concepts
   - Alternative implementations in JavaScript
   - Balanced structure matching
   - Nested content processing
3. Advanced Replacement Techniques:
   - Template replacement
   - Conditional replacement
   - Context-aware replacement
   - Smart formatting
4. Limitations of Regular Expressions:
   - Types of problems that cannot be handled
   - Performance pitfalls
   - When not to use regular expressions
   - Alternative solutions
5. Best Practice Recommendations:
   - Choosing the right tool
   - Balancing complexity and maintainability
   - Performance optimization considerations
   - Error handling and user experience
Understanding these advanced topics and limitations helps us use regular expressions more wisely: maximizing their value in the scenarios they suit while avoiding unnecessary complexity and performance issues.
Course Summary
Through these 12 chapters, we have comprehensively mastered:
- Basic syntax and concepts of regular expressions
- Character matching and pattern building
- Advanced features and techniques
- Practical applications and best practices
- Performance optimization and alternatives
Regular expressions are powerful text processing tools, but remember: the value of a tool lies in solving problems, not in demonstrating complexity. Choose the tool that best fits the problem and write readable, performant code—this is the mark of a professional developer.