You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

339 lines
9.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

'use strict';
const generate = require('regjsgen').generate;
const parse = require('regjsparser').parse;
const regenerate = require('regenerate');
const unicodeMatchProperty = require('unicode-match-property-ecmascript');
const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
const iuMappings = require('./data/iu-mappings.js');
const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
// Regenerate set holding every code point (0x0 through 0x10FFFF). Negated
// sets are derived from it by cloning and removing the positive set.
const UNICODE_SET = regenerate().addRange(0x000000, 0x10FFFF);

// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0000, 0xFFFF);

// Regenerate set of the code points that `/./u` is supposed to match
// (https://mths.be/es6#sec-atom): all code points minus the `LineTerminator`s
// (https://mths.be/es6#sec-line-terminators).
const DOT_SET_UNICODE = UNICODE_SET.clone().remove(
  0x000A, // Line Feed <LF>
  0x000D, // Carriage Return <CR>
  0x2028, // Line Separator <LS>
  0x2029 // Paragraph Separator <PS>
);
// Looks up the precomputed set for a character class escape in `ESCAPE_SETS`.
// The table is chosen from the flags: `REGULAR` when `unicode` is falsy
// (`ignoreCase` is not consulted in that case), otherwise
// `UNICODE_IGNORE_CASE` or `UNICODE` depending on `ignoreCase`.
// Returns whatever the chosen table's `get(character)` yields.
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
  let tableName = 'REGULAR';
  if (unicode) {
    tableName = ignoreCase ? 'UNICODE_IGNORE_CASE' : 'UNICODE';
  }
  return ESCAPE_SETS[tableName].get(character);
};
// Picks the set used to rewrite `.` in Unicode mode: the full `UNICODE_SET`
// when `dotAll` is truthy, otherwise `DOT_SET_UNICODE`. The shared constant
// itself is returned (not a clone), so callers must not mutate it.
const getUnicodeDotSet = (dotAll) => {
  if (dotAll) {
    return UNICODE_SET;
  }
  return DOT_SET_UNICODE;
};
const getUnicodePropertyValueSet = (property, value) => {
const path = value ?
`${ property }/${ value }` :
`Binary_Property/${ property }`;
try {
return require(`regenerate-unicode-properties/${ path }.js`);
} catch (exception) {
throw new Error(
`Failed to recognize value \`${ value }\` for property ` +
`\`${ property }\`.`
);
}
};
// Resolves a property escape written without `=` (a lone name).
// It could be a `General_Category` value or a binary property, so both
// interpretations are tried, in that order.
const handleLoneUnicodePropertyNameOrValue = (value) => {
  // First attempt: treat it as a `General_Category` value.
  // Note: `unicodeMatchPropertyValue` throws on invalid values.
  try {
    const category = unicodeMatchPropertyValue('General_Category', value);
    return getUnicodePropertyValueSet('General_Category', category);
  } catch (exception) {
    // Deliberately ignored: any failure above just means the second
    // interpretation below is attempted.
  }
  // It's not a `General_Category` value, so check if it's a binary
  // property. Note: `unicodeMatchProperty` throws on invalid properties.
  const binaryProperty = unicodeMatchProperty(value);
  return getUnicodePropertyValueSet(binaryProperty);
};
// Builds the set for a Unicode property escape.
// `value` is the escape's content, either a lone name or `Property=Value`
// (only the first two `=`-separated segments are used).
// Returns a fresh set in both cases: a clone of the property's set, or,
// when `isNegative` is truthy, a clone of `UNICODE_SET` with the property's
// set removed. The loaded property data is never mutated.
const getUnicodePropertyEscapeSet = (value, isNegative) => {
  const segments = value.split('=');
  const propertyOrLoneName = segments[0];
  let matched;
  if (segments.length !== 1) {
    // The pattern consists of two parts, i.e. `Property=Value`.
    const canonicalProperty = unicodeMatchProperty(propertyOrLoneName);
    const canonicalValue = unicodeMatchPropertyValue(
      canonicalProperty,
      segments[1]
    );
    matched = getUnicodePropertyValueSet(canonicalProperty, canonicalValue);
  } else {
    matched = handleLoneUnicodePropertyNameOrValue(propertyOrLoneName);
  }
  return isNegative ?
    UNICODE_SET.clone().remove(matched) :
    matched.clone();
};
// Given a range of code points, add any case-folded code points in that range
// to a set. `caseFold` is consulted for each code point from `min` up to
// `max`; `min` itself is always examined, even if it is greater than `max`.
// Returns the set itself so that calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
  let codePoint = min;
  do {
    const foldedCodePoint = caseFold(codePoint);
    if (foldedCodePoint) {
      this.add(foldedCodePoint);
    }
    codePoint++;
  } while (codePoint <= max);
  return this;
};
// Replaces the contents of AST node `item` with the parse result of
// `pattern`, in place (via `Object.assign`, so properties of `item` that the
// new tree does not define are left as they were).
// The pattern is parsed with the `u` flag only when `config.useUnicodeFlag`
// is set. A result that is already a character class, a group or a single
// value is used as-is; anything else is wrapped in a non-capturing group.
const update = (item, pattern) => {
  const parserFlags = config.useUnicodeFlag ? 'u' : '';
  const parsed = parse(pattern, parserFlags);
  // Node types that need no wrapping.
  const selfContainedTypes = ['characterClass', 'group', 'value'];
  const replacement = selfContainedTypes.includes(parsed.type) ?
    parsed :
    wrap(parsed, pattern);
  Object.assign(item, replacement);
};
// Builds a non-capturing group node (`behavior: 'ignore'`) whose only child
// is `tree` and whose `raw` source is `pattern` surrounded by `(?:` and `)`.
// `tree` is referenced, not copied.
const wrap = (tree, pattern) => {
  const raw = `(?:${ pattern })`;
  const body = [tree];
  return { 'type': 'group', 'behavior': 'ignore', body, raw };
};
// Looks `codePoint` up in `iuMappings`. Returns the mapped entry when there
// is a truthy one, and `false` otherwise (no entry, or a falsy entry).
const caseFold = (codePoint) => {
  const mapped = iuMappings.get(codePoint);
  if (mapped) {
    return mapped;
  }
  return false;
};
// Rewrites a `characterClass` node in place.
// Every term in the class body is accumulated into a single regenerate set;
// for a negated class that set is then subtracted from a clone of
// `UNICODE_SET` (or of `BMP_SET` when `config.unicode` is off). The node is
// finally overwritten, via `update`, with the pattern that the resulting set
// serializes to under `regenerateOptions`.
// Returns the same `characterClassItem` object.
// Throws on a term type it does not know.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
  // Case-folded counterparts are only added for `i` + `u` patterns whose
  // output will not itself carry the `u` flag.
  const needsCaseFolding = () =>
    config.ignoreCase && config.unicode && !config.useUnicodeFlag;

  let members = regenerate();
  for (const term of characterClassItem.body) {
    switch (term.type) {
      case 'value': {
        members.add(term.codePoint);
        if (needsCaseFolding()) {
          const foldedCodePoint = caseFold(term.codePoint);
          if (foldedCodePoint) {
            members.add(foldedCodePoint);
          }
        }
        break;
      }
      case 'characterClassRange': {
        const lowest = term.min.codePoint;
        const highest = term.max.codePoint;
        members.addRange(lowest, highest);
        if (needsCaseFolding()) {
          members.iuAddRange(lowest, highest);
        }
        break;
      }
      case 'characterClassEscape': {
        const escapeSet = getCharacterClassEscapeSet(
          term.value,
          config.unicode,
          config.ignoreCase
        );
        members.add(escapeSet);
        break;
      }
      case 'unicodePropertyEscape': {
        const propertySet = getUnicodePropertyEscapeSet(
          term.value,
          term.negative
        );
        members.add(propertySet);
        break;
      }
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown term type: ${ term.type }`);
    }
  }
  if (characterClassItem.negative) {
    const universe = config.unicode ? UNICODE_SET : BMP_SET;
    // Clone first so the shared constant is never mutated.
    members = universe.clone().remove(members);
  }
  update(characterClassItem, members.toString(regenerateOptions));
  return characterClassItem;
};
// Turns a named reference node into a numbered one, in place: records
// `index` as the node's `matchIndex` and drops its `name` property.
const updateNamedReference = (item, index) => {
  item.matchIndex = index;
  delete item.name;
};
// Throws when `groups.unmatchedReferences` still has entries, i.e. when some
// named references never met a group of the same name. The error message
// lists the offending names, comma-separated.
const assertNoUnmatchedReferences = (groups) => {
  const pendingNames = Object.keys(groups.unmatchedReferences);
  if (pendingNames.length === 0) {
    return;
  }
  throw new Error(`Unknown group names: ${pendingNames}`);
};
// Recursively rewrites one AST node (and its children) in place, according
// to the module-level `config`.
//
// Parameters:
// - `item`: the node to process; it is mutated.
// - `regenerateOptions`: passed through to every regenerate `toString` call.
// - `groups`: bookkeeping shared across the whole pattern. This function
//   reads and writes `lastIndex`, `names` and `unmatchedReferences`, and
//   calls `onNamedGroup(name, index)` when it is set.
//
// Returns the processed node.
// Throws on duplicate group names (when named groups are being rewritten)
// and on unknown node types.
//
// Named groups and named references are rewritten to their numbered forms
// only when `config.namedGroup` is set. When it is not set, both are left
// untouched: group names are not recorded in that mode, so trying to resolve
// references would flag every one of them as referring to an unknown group.
const processTerm = (item, regenerateOptions, groups) => {
  switch (item.type) {
    case 'dot':
      if (config.unicode) {
        update(
          item,
          getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
        );
      } else if (config.dotAll) {
        // TODO: consider changing this at the regenerate level.
        update(item, '[\\s\\S]');
      }
      break;
    case 'characterClass':
      item = processCharacterClass(item, regenerateOptions);
      break;
    case 'unicodePropertyEscape':
      if (config.unicodePropertyEscape) {
        update(
          item,
          getUnicodePropertyEscapeSet(item.value, item.negative)
            .toString(regenerateOptions)
        );
      }
      break;
    case 'characterClassEscape':
      update(
        item,
        getCharacterClassEscapeSet(
          item.value,
          config.unicode,
          config.ignoreCase
        ).toString(regenerateOptions)
      );
      break;
    case 'group':
      // Only groups with `normal` behavior advance the running group index.
      if (item.behavior === 'normal') {
        groups.lastIndex++;
      }
      if (item.name && config.namedGroup) {
        const name = item.name.value;
        if (groups.names[name]) {
          throw new Error(
            `Multiple groups with the same name (${ name }) are not allowed.`
          );
        }
        const index = groups.lastIndex;
        delete item.name;
        groups.names[name] = index;
        if (groups.onNamedGroup) {
          groups.onNamedGroup.call(null, name, index);
        }
        // Resolve references that appeared before this group.
        if (groups.unmatchedReferences[name]) {
          groups.unmatchedReferences[name].forEach(reference => {
            updateNamedReference(reference, index);
          });
          delete groups.unmatchedReferences[name];
        }
      }
      /* falls through */
    case 'alternative':
    case 'disjunction':
    case 'quantifier':
      item.body = item.body.map(term => {
        return processTerm(term, regenerateOptions, groups);
      });
      break;
    case 'value': {
      const codePoint = item.codePoint;
      const set = regenerate(codePoint);
      if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
        const folded = caseFold(codePoint);
        if (folded) {
          set.add(folded);
        }
      }
      update(item, set.toString(regenerateOptions));
      break;
    }
    case 'reference':
      // Group names are only collected when `config.namedGroup` is set, so
      // references can only be resolved (or reported as unknown) in that
      // mode; otherwise the named reference is kept verbatim.
      if (item.name && config.namedGroup) {
        const name = item.name.value;
        const index = groups.names[name];
        if (index) {
          updateNamedReference(item, index);
          break;
        }
        if (!groups.unmatchedReferences[name]) {
          groups.unmatchedReferences[name] = [];
        }
        // Keep track of references used before the corresponding group.
        groups.unmatchedReferences[name].push(item);
      }
      break;
    case 'anchor':
    case 'empty':
      // Nothing to do here.
      break;
    // The `default` clause is only here as a safeguard; it should never be
    // reached. Code coverage tools should ignore it.
    /* istanbul ignore next */
    default:
      throw new Error(`Unknown term type: ${ item.type }`);
  }
  return item;
};
// Module-level settings consulted by the rewriting helpers in this file.
// Every field starts out `false` and is overwritten by `rewritePattern` on
// each call, from the flags and options it receives.
const config = {
  // Derived from the pattern's flags (`i`, `u`, `s`).
  ignoreCase: false,
  unicode: false,
  dotAll: false,
  // Derived from the caller's options.
  useUnicodeFlag: false,
  unicodePropertyEscape: false,
  namedGroup: false
};
// Entry point: rewrites the source of a regular expression.
//
// Parameters:
// - `pattern`: the pattern source to rewrite.
// - `flags`: optional flags string; `u`, `i` and `s` are inspected here and
//   the string is also handed to the parser as-is.
// - `options`: optional object; the fields read are `lookbehind`,
//   `dotAllFlag`, `namedGroup`, `useUnicodeFlag`, `unicodePropertyEscape`
//   and `onNamedGroup`.
//
// Returns the string produced by `generate` for the rewritten tree.
// Throws whatever parsing or term processing throws, and an error when named
// references are left unmatched.
//
// Note: this overwrites the shared module-level `config` on every call.
const rewritePattern = (pattern, flags, options) => {
  // Both helpers keep the short-circuit results of the `x && x.y` idiom, so
  // a missing `flags`/`options` yields that falsy value itself.
  const hasFlag = (flag) => flags && flags.includes(flag);
  const readOption = (key) => options && options[key];

  config.unicode = hasFlag('u');
  const regjsparserFeatures = {
    'unicodePropertyEscape': config.unicode,
    'namedGroups': true,
    'lookbehind': readOption('lookbehind')
  };
  config.ignoreCase = hasFlag('i');
  // The `s` flag only counts when the caller opted in through `dotAllFlag`.
  config.dotAll = readOption('dotAllFlag') && hasFlag('s');
  config.namedGroup = readOption('namedGroup');
  config.useUnicodeFlag = readOption('useUnicodeFlag');
  config.unicodePropertyEscape = readOption('unicodePropertyEscape');

  const regenerateOptions = {
    'hasUnicodeFlag': config.useUnicodeFlag,
    'bmpOnly': !config.unicode
  };
  const groups = {
    'onNamedGroup': readOption('onNamedGroup'),
    'lastIndex': 0,
    'names': Object.create(null), // { [name]: index }
    'unmatchedReferences': Object.create(null) // { [name]: Array<reference> }
  };

  const tree = parse(pattern, flags, regjsparserFeatures);
  // Note: `processTerm` mutates `tree` and `groups`.
  processTerm(tree, regenerateOptions, groups);
  assertNoUnmatchedReferences(groups);
  return generate(tree);
};
module.exports = rewritePattern;