encoder/encode-set.js

/**
 * @fileoverview
 * An EncodeSet represents a set of characters that should be percent-encoded.
 *
 * Different characters need to be encoded in different parts of an URL.
 * For example, a literal ? question mark in an URL’s path would indicate the
 * start of the query string. A question mark meant to be part of the path
 * therefore needs to be percent-encoded.
 * In the query string however, a question mark does not have any special
 * meaning and does not need to be percent-encoded.
 *
 * A few sets are defined in this module.
 * Use the {@link EncodeSet} class to define different ones.
 *
 * @see {@link https://url.spec.whatwg.org/#simple-encode-set}
 */

/**
 * A character (String), or character code (Number).
 *
 * @typedef {String|Number} Char
 */

/**
 * A Set or Array of {@link Char}(s).
 *
 * @typedef {Set.<Char>|Array.<Char>} CharSet
 */

const QUERY_ENCODE_CHARS = [' ', '"', '#', '\'', '<', '>'],
    FRAGMENT_EXTEND_CHARS = [' ', '"', '<', '>', '`'],
    PATH_EXTEND_CHARS = ['#', '?', '{', '}'],
    USERINFO_EXTEND_CHARS = ['/', ':', ';', '=', '@', '[', '\\', ']', '^', '|'];

/**
 * Returns a number representing the UTF-16 code unit value of the character.
 *
 * @private
 * @param {Char} char Character or character code
 * @returns {Number} Character code
 */
function charCode (char) {
    const code = (typeof char === 'string') ?
        // get char code from string
        char.charCodeAt(0) :
        // or, normalize char code using double Bitwise NOT
        // Refer: https://jsperf.com/truncating-decimals
        ~~char;

    // ensure UTF-16 range [0, 0xFFFF]
    return (code >= 0 && code <= 0xFFFF) ? code : 0;
}

/**
 * Extends the EncodeSet with the given characters.
 *
 * @note Mutates the input EncodeSet.
 *
 * @private
 * @param {EncodeSet} encodeSet Instance of EncodeSet
 * @param {CharSet} chars Character set to extend
 * @returns {EncodeSet} Given EncodeSet
 */
function extendEncodeSet (encodeSet, chars) {
    // special handling for Uint8Array chars which signify an existing encode
    // set used to extend the given encodeSet.
    if (chars instanceof Uint8Array) {
        // iterate over fixed / known size set
        encodeSet._set.forEach((encoded, index) => {
            if (!encoded && chars[index]) {
                // encode charCodeAt(index)
                encodeSet._set[index] = 1;
            }
        });

        return encodeSet;
    }

    // check if the input characters are iterable or not
    if (!(chars && typeof chars.forEach === 'function')) {
        return encodeSet;
    }

    chars.forEach((char) => {
        encodeSet.add(char);
    });

    return encodeSet;
}

/**
 * Represents a set of characters / bytes that should be percent-encoded.
 */
class EncodeSet {
    /**
     * @param {CharSet} chars Character set to encode
     */
    constructor (chars) {
        /**
         * Indexes in Uint8Array represents char codes for characters to encode.
         *
         * Size: 128, ASCII range [0, 0x7F]
         *
         * where,
         * 1 -> encode
         * 0 -> don't encode
         *
         * @private
         * @type {Uint8Array}
         */
        this._set = new Uint8Array(0x80);

        // encode C0 control codes [00, 0x1F] AND 0x7F
        this._set.fill(1, 0, 0x20); // 0 to 31
        this._set[0x7F] = 1; // 127

        /**
         * A Boolean indicating whether or not this EncodeSet is sealed.
         *
         * @private
         * @type {Boolean}
         */
        this._sealed = false;

        // extend this set with input characters
        extendEncodeSet(this, chars);
    }

    /**
     * Appends a new character to the EncodeSet.
     *
     * @example
     * var xyzEncodeSet = new EncodeSet(['x', 'y', 'z'])
     *
     * xyzEncodeSet
     *  .add('X')
     *  .add(89) // Y
     *  .add(0x5a) // Z
     *
     * @param {Char} char Character or character code
     * @returns {EncodeSet} Current EncodeSet
     */
    add (char) {
        // bail out if the EncodeSet is sealed
        if (this._sealed) {
            return this;
        }

        const code = charCode(char);

        // ensure ASCII range
        if (code < 0x80) {
            this._set[code] = 1;
        }

        // chaining
        return this;
    }

    /**
     * Returns a boolean asserting whether the given char code will be encoded in
     * the EncodeSet or not.
     *
     * @note Always encode C0 control codes in the range U+0000 to U+001F and U+007F
     * Refer: https://infra.spec.whatwg.org/#c0-control
     *
     * @example
     * var tildeEncodeSet = new EncodeSet(['~'])
     *
     * // returns true
     * tildeEncodeSet.has('~'.charCodeAt(0))
     *
     * // returns false
     * tildeEncodeSet.has(65) // A
     *
     * // returns true
     * tildeEncodeSet.has(31) // \u001f (control character)
     *
     * @param {Number} code Character code
     * @returns {Boolean} Returns true if the character with the specified char code
     * exists in the EncodeSet; otherwise false
     */
    has (code) {
        // encode if not in ASCII range (-∞, 0) OR (127, ∞)
        if (code < 0 || code > 0x7F) {
            return true;
        }

        // encode if present in the set
        return Boolean(this._set[code]);
    }

    /**
     * Creates a copy of the current EncodeSet.
     *
     * @example
     * var set1 = new EncodeSet(['<', '>'])
     * var set1Copy = set1.clone().add('=')
     *
     * @returns {EncodeSet} New EncodeSet instance
     */
    clone () {
        return new EncodeSet(this._set);
    }

    /**
     * Seals the current EncodeSet to prevent new characters being added to it.
     *
     * @example
     * var set = new EncodeSet()
     *
     * set.add(95)
     * set.has(95) // returns true
     *
     * set.seal()
     * set.add(100)
     * set.has(100) // returns false
     *
     * @returns {EncodeSet} Current EncodeSet
     */
    seal () {
        this._sealed = true;

        try {
            // @note Cannot freeze array buffer views with elements.
            // So, rely upon the alternative `Object.seal` method and avoid mutations
            // via EncodeSet~add method.
            // Also, sealed Uint8Array enumerates faster in V8!
            Object.seal(this._set);
        }
        catch (_) {
            // silently swallow exceptions
        }

        return this;
    }

    /**
     * Creates a new EncodeSet by extending the input EncodeSet with additional
     * characters.
     *
     * @example
     * var fooEncodeSet = new EncodeSet(['f', 'o'])
     * var foobarEncodeSet = EncodeSet.extend(fooEncodeSet, new Set(['b', 'a', 'r']))
     *
     * @param {EncodeSet} encodeSet Instance of EncodeSet
     * @param {CharSet} chars Character set to encode
     * @returns {EncodeSet} Copy of given `encodeSet` with extended `chars`
     * @throws {TypeError} Argument `encodeSet` must be of type {@link EncodeSet}
     */
    static extend (encodeSet, chars) {
        if (!EncodeSet.isEncodeSet(encodeSet)) {
            throw new TypeError('Argument `encodeSet` must be EncodeSet');
        }

        // extend the cloned encodeSet to avoid mutations
        return extendEncodeSet(encodeSet.clone(), chars);
    }

    /**
     * Determines whether the input value is an EncodeSet or not.
     *
     * @example
     * // returns true
     * EncodeSet.isEncodeSet(new EncodeSet([40, 41]))
     *
     * // returns false
     * EncodeSet.isEncodeSet(new Set([28, 05]))
     *
     * @param {*} value The value to be tested
     * @returns {Boolean} true if the given value is an EncodeSet; otherwise, false
     */
    static isEncodeSet (value) {
        return Boolean(value) && (value instanceof EncodeSet);
    }
}

const // eslint-disable-line one-var

    /**
     * The C0 control percent-encode set are the C0 controls and all code points
     * greater than U+007E (~).
     *
     * @const
     * @type {EncodeSet}
     * @see {@link https://url.spec.whatwg.org/#c0-control-percent-encode-set}
     */
    C0_CONTROL_ENCODE_SET = new EncodeSet().seal(),

    /**
     * The fragment percent-encode set is the C0 control percent-encode set and
     * U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
     *
     * @const
     * @type {EncodeSet}
     * @see {@link https://url.spec.whatwg.org/#fragment-percent-encode-set}
     */
    FRAGMENT_ENCODE_SET = EncodeSet.extend(C0_CONTROL_ENCODE_SET, FRAGMENT_EXTEND_CHARS).seal(),

    /**
     * The path percent-encode set is the fragment percent-encode set and
     * U+0023 (#), U+003F (?), U+007B ({), and U+007D (}).
     *
     * @const
     * @type {EncodeSet}
     * @see {@link https://url.spec.whatwg.org/#path-percent-encode-set}
     */
    PATH_ENCODE_SET = EncodeSet.extend(FRAGMENT_ENCODE_SET, PATH_EXTEND_CHARS).seal(),

    /**
     * The userinfo percent-encode set is the path percent-encode set and
     * U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@), U+005B ([),
     * U+005C (\), U+005D (]), U+005E (^), and U+007C (|).
     *
     * @const
     * @type {EncodeSet}
     * @see {@link https://url.spec.whatwg.org/#userinfo-percent-encode-set}
     */
    USERINFO_ENCODE_SET = EncodeSet.extend(PATH_ENCODE_SET, USERINFO_EXTEND_CHARS).seal(),

    /**
     * The query percent-encode set is the C0 control percent-encode set and
     * U+0020 SPACE, U+0022 ("), U+0023 (#), U+0027 ('), U+003C (<), and U+003E (>).
     *
     * @const
     * @type {EncodeSet}
     * @see {@link https://url.spec.whatwg.org/#query-state}
     */
    QUERY_ENCODE_SET = new EncodeSet(QUERY_ENCODE_CHARS).seal();

module.exports = {
    // EncodeSet class
    EncodeSet,

    // Constants
    PATH_ENCODE_SET,
    QUERY_ENCODE_SET,
    FRAGMENT_ENCODE_SET,
    USERINFO_ENCODE_SET,
    C0_CONTROL_ENCODE_SET
};