import { syntaxError } from '../error/syntaxError.mjs';
import { Token } from './ast.mjs';
import { dedentBlockStringLines } from './blockString.mjs';
import { isDigit, isNameContinue, isNameStart } from './characterClasses.mjs';
import { TokenKind } from './tokenKind.mjs';

/**
 * Given a Source object, creates a Lexer for that source.
 * A Lexer is a stateful stream generator in that every time
 * it is advanced, it returns the next token in the Source. Assuming the
 * source lexes, the final Token emitted by the lexer will be of kind
 * EOF, after which the lexer will repeatedly return the same EOF token
 * whenever called.
 */
export class Lexer {
  /**
   * @param source - Object whose `body` string holds the text to lex.
   */
  constructor(source) {
    const startOfFileToken = new Token(TokenKind.SOF, 0, 0, 0, 0);

    this.source = source;

    // The previously focused non-ignored token.
    this.lastToken = startOfFileToken;

    // The currently focused non-ignored token.
    this.token = startOfFileToken;

    // The (1-indexed) line containing the current token.
    this.line = 1;

    // The character offset at which the current line begins.
    this.lineStart = 0;
  }

  get [Symbol.toStringTag]() {
    return 'Lexer';
  }

  /**
   * Advances the token stream to the next non-ignored token.
   *
   * @returns The token that is now focused (also stored on `this.token`).
   */
  advance() {
    this.lastToken = this.token;
    const token = (this.token = this.lookahead());
    return token;
  }

  /**
   * Looks ahead and returns the next non-ignored token, but does not change
   * the focused token of the Lexer. Tokens that are read are linked onto the
   * token list via `next`/`prev`, so repeated lookaheads reuse them.
   *
   * @returns The next token whose kind is not COMMENT, or the current token
   *   when it is already EOF.
   */
  lookahead() {
    let token = this.token;

    if (token.kind !== TokenKind.EOF) {
      do {
        if (token.next) {
          token = token.next;
        } else {
          // Read the next token and form a link in the token linked-list.
          const nextToken = readNextToken(this, token.end);
          // @ts-expect-error next is only mutable during parsing.
          token.next = nextToken;
          // @ts-expect-error prev is only mutable during parsing.
          nextToken.prev = token;
          token = nextToken;
        }
      } while (token.kind === TokenKind.COMMENT);
    }

    return token;
  }
}

/**
 * Reports whether the given token kind is one of the punctuator kinds.
 *
 * @internal
 */
export function isPunctuatorTokenKind(kind) {
  return (
    kind === TokenKind.BANG ||
    kind === TokenKind.DOLLAR ||
    kind === TokenKind.AMP ||
    kind === TokenKind.PAREN_L ||
    kind === TokenKind.PAREN_R ||
    kind === TokenKind.SPREAD ||
    kind === TokenKind.COLON ||
    kind === TokenKind.EQUALS ||
    kind === TokenKind.AT ||
    kind === TokenKind.BRACKET_L ||
    kind === TokenKind.BRACKET_R ||
    kind === TokenKind.BRACE_L ||
    kind === TokenKind.PIPE ||
    kind === TokenKind.BRACE_R
  );
}

/**
 * A Unicode scalar value is any Unicode code point except surrogate code
 * points. In other words, the inclusive ranges of values 0x0000 to 0xD7FF and
 * 0xE000 to 0x10FFFF.
 *
 * SourceCharacter ::
 *   - "Any Unicode scalar value"
 */
function isUnicodeScalarValue(code) {
  return (
    (code >= 0x0000 && code <= 0xd7ff) || (code >= 0xe000 && code <= 0x10ffff)
  );
}

/**
 * The GraphQL specification defines source text as a sequence of unicode scalar
 * values (which Unicode defines to exclude surrogate code points). However
 * JavaScript defines strings as a sequence of UTF-16 code units which may
 * include surrogates. A surrogate pair is a valid source character as it
 * encodes a supplementary code point (above U+FFFF), but unpaired surrogate
 * code points are not valid source characters.
 *
 * @returns true when the code unit at `location` is a leading surrogate that
 *   is immediately followed by a trailing surrogate.
 */
function isSupplementaryCodePoint(body, location) {
  return (
    isLeadingSurrogate(body.charCodeAt(location)) &&
    isTrailingSurrogate(body.charCodeAt(location + 1))
  );
}

/**
 * Reports whether the code unit lies in the range 0xD800 to 0xDBFF.
 */
function isLeadingSurrogate(code) {
  return code >= 0xd800 && code <= 0xdbff;
}

/**
 * Reports whether the code unit lies in the range 0xDC00 to 0xDFFF.
 */
function isTrailingSurrogate(code) {
  return code >= 0xdc00 && code <= 0xdfff;
}

/**
 * Prints the code point (or end of file reference) at a given location in a
 * source for use in error messages.
 *
 * Printable ASCII is printed quoted, while other points are printed in Unicode
 * code point form (ie. U+1234).
 */
function printCodePointAt(lexer, location) {
  const code = lexer.source.body.codePointAt(location);

  if (code === undefined) {
    // `location` is past the end of the body.
    return TokenKind.EOF;
  } else if (code >= 0x0020 && code <= 0x007e) {
    // Printable ASCII
    const char = String.fromCodePoint(code);
    // A double quote is wrapped in single quotes so it stays readable.
    return char === '"' ? "'\"'" : `"${char}"`;
  }

  // Unicode code point
  return `U+${code.toString(16).toUpperCase().padStart(4, '0')}`;
}

/**
 * Create a token with line and column location information.
 *
 * The line is taken from `lexer.line` and the 1-indexed column is derived from
 * `start` relative to `lexer.lineStart`, so both must describe the line on
 * which the token starts at the time of the call.
 */
function createToken(lexer, kind, start, end, value) {
  const line = lexer.line;
  const col = 1 + start - lexer.lineStart;
  return new Token(kind, start, end, line, col, value);
}

/**
 * Gets the next token from the source starting at the given position.
 *
 * This skips over whitespace until it finds the next lexable token, then lexes
 * punctuators immediately or calls the appropriate helper function for more
 * complicated tokens.
 *
 * Passing a line terminator increments `lexer.line` and moves
 * `lexer.lineStart`. When the end of the body is reached an EOF token is
 * returned.
 *
 * @throws The error produced by `syntaxError` when the character at the
 *   current position cannot start any token.
 */
function readNextToken(lexer, start) {
  const body = lexer.source.body;
  const bodyLength = body.length;
  let position = start;

  while (position < bodyLength) {
    const code = body.charCodeAt(position);

    // SourceCharacter
    switch (code) {
      // Ignored ::
      //   - UnicodeBOM
      //   - WhiteSpace
      //   - LineTerminator
      //   - Comment
      //   - Comma
      //
      // UnicodeBOM :: "Byte Order Mark (U+FEFF)"
      //
      // WhiteSpace ::
      //   - "Horizontal Tab (U+0009)"
      //   - "Space (U+0020)"
      //
      // Comma :: ,
      case 0xfeff: // <BOM>
      case 0x0009: // \t
      case 0x0020: // <space>
      case 0x002c: // ,
        ++position;
        continue;

      // LineTerminator ::
      //   - "New Line (U+000A)"
      //   - "Carriage Return (U+000D)" [lookahead != "New Line (U+000A)"]
      //   - "Carriage Return (U+000D)" "New Line (U+000A)"
      case 0x000a: // \n
        ++position;
        ++lexer.line;
        lexer.lineStart = position;
        continue;

      case 0x000d: // \r
        // \r\n counts as a single line terminator.
        if (body.charCodeAt(position + 1) === 0x000a) {
          position += 2;
        } else {
          ++position;
        }

        ++lexer.line;
        lexer.lineStart = position;
        continue;

      // Comment
      case 0x0023: // #
        return readComment(lexer, position);

      // Token ::
      //   - Punctuator
      //   - Name
      //   - IntValue
      //   - FloatValue
      //   - StringValue
      //
      // Punctuator :: one of ! $ & ( ) ... : = @ [ ] { | }
      case 0x0021: // !
        return createToken(lexer, TokenKind.BANG, position, position + 1);

      case 0x0024: // $
        return createToken(lexer, TokenKind.DOLLAR, position, position + 1);

      case 0x0026: // &
        return createToken(lexer, TokenKind.AMP, position, position + 1);

      case 0x0028: // (
        return createToken(lexer, TokenKind.PAREN_L, position, position + 1);

      case 0x0029: // )
        return createToken(lexer, TokenKind.PAREN_R, position, position + 1);

      case 0x002e: // .
        if (
          body.charCodeAt(position + 1) === 0x002e &&
          body.charCodeAt(position + 2) === 0x002e
        ) {
          return createToken(lexer, TokenKind.SPREAD, position, position + 3);
        }

        // Anything other than "..." falls through to the error below.
        break;

      case 0x003a: // :
        return createToken(lexer, TokenKind.COLON, position, position + 1);

      case 0x003d: // =
        return createToken(lexer, TokenKind.EQUALS, position, position + 1);

      case 0x0040: // @
        return createToken(lexer, TokenKind.AT, position, position + 1);

      case 0x005b: // [
        return createToken(lexer, TokenKind.BRACKET_L, position, position + 1);

      case 0x005d: // ]
        return createToken(lexer, TokenKind.BRACKET_R, position, position + 1);

      case 0x007b: // {
        return createToken(lexer, TokenKind.BRACE_L, position, position + 1);

      case 0x007c: // |
        return createToken(lexer, TokenKind.PIPE, position, position + 1);

      case 0x007d: // }
        return createToken(lexer, TokenKind.BRACE_R, position, position + 1);

      // StringValue
      case 0x0022: // "
        if (
          body.charCodeAt(position + 1) === 0x0022 &&
          body.charCodeAt(position + 2) === 0x0022
        ) {
          return readBlockString(lexer, position);
        }

        return readString(lexer, position);
    }

    // IntValue | FloatValue (Digit | -)
    if (isDigit(code) || code === 0x002d) {
      return readNumber(lexer, position, code);
    }

    // Name
    if (isNameStart(code)) {
      return readName(lexer, position);
    }

    throw syntaxError(
      lexer.source,
      position,
      code === 0x0027
        ? 'Unexpected single quote character (\'), did you mean to use a double quote (")?'
        : isUnicodeScalarValue(code) || isSupplementaryCodePoint(body, position)
        ? `Unexpected character: ${printCodePointAt(lexer, position)}.`
        : `Invalid character: ${printCodePointAt(lexer, position)}.`,
    );
  }

  return createToken(lexer, TokenKind.EOF, bodyLength, bodyLength);
}

/**
 * Reads a comment token from the source file.
 *
 * ```
 * Comment :: # CommentChar* [lookahead != CommentChar]
 *
 * CommentChar :: SourceCharacter but not LineTerminator
 * ```
 *
 * The token value is the comment text without the leading `#`. Reading stops
 * at a line terminator, at the end of the body, or at a code unit that is not
 * a valid source character.
 */
function readComment(lexer, start) {
  const body = lexer.source.body;
  const bodyLength = body.length;
  let position = start + 1;

  while (position < bodyLength) {
    const code = body.charCodeAt(position);

    // LineTerminator (\n | \r)
    if (code === 0x000a || code === 0x000d) {
      break;
    }

    // SourceCharacter
    if (isUnicodeScalarValue(code)) {
      ++position;
    } else if (isSupplementaryCodePoint(body, position)) {
      position += 2;
    } else {
      break;
    }
  }

  return createToken(
    lexer,
    TokenKind.COMMENT,
    start,
    position,
    body.slice(start + 1, position),
  );
}

/**
 * Reads a number token from the source file, either a FloatValue or an IntValue
 * depending on whether a FractionalPart or ExponentPart is encountered.
 *
 * ```
 * IntValue :: IntegerPart [lookahead != {Digit, `.`, NameStart}]
 *
 * IntegerPart ::
 *   - NegativeSign? 0
 *   - NegativeSign? NonZeroDigit Digit*
 *
 * NegativeSign :: -
 *
 * NonZeroDigit :: Digit but not `0`
 *
 * FloatValue ::
 *   - IntegerPart FractionalPart ExponentPart [lookahead != {Digit, `.`, NameStart}]
 *   - IntegerPart FractionalPart [lookahead != {Digit, `.`, NameStart}]
 *   - IntegerPart ExponentPart [lookahead != {Digit, `.`, NameStart}]
 *
 * FractionalPart :: . Digit+
 *
 * ExponentPart :: ExponentIndicator Sign? Digit+
 *
 * ExponentIndicator :: one of `e` `E`
 *
 * Sign :: one of + -
 * ```
 *
 * @param firstCode - The code unit at `start`, already read by the caller.
 * @returns A FLOAT or INT token whose value is the raw source text.
 * @throws The error produced by `syntaxError` for a digit after a leading 0,
 *   a missing digit, or a number directly followed by `.` or a NameStart.
 */
function readNumber(lexer, start, firstCode) {
  const body = lexer.source.body;
  let position = start;
  let code = firstCode;
  let isFloat = false;

  // NegativeSign (-)
  if (code === 0x002d) {
    code = body.charCodeAt(++position);
  }

  // Zero (0)
  if (code === 0x0030) {
    code = body.charCodeAt(++position);

    if (isDigit(code)) {
      throw syntaxError(
        lexer.source,
        position,
        `Invalid number, unexpected digit after 0: ${printCodePointAt(
          lexer,
          position,
        )}.`,
      );
    }
  } else {
    position = readDigits(lexer, position, code);
    code = body.charCodeAt(position);
  }

  // Full stop (.)
  if (code === 0x002e) {
    isFloat = true;
    code = body.charCodeAt(++position);
    position = readDigits(lexer, position, code);
    code = body.charCodeAt(position);
  }

  // E e
  if (code === 0x0045 || code === 0x0065) {
    isFloat = true;
    code = body.charCodeAt(++position);

    // + -
    if (code === 0x002b || code === 0x002d) {
      code = body.charCodeAt(++position);
    }

    position = readDigits(lexer, position, code);
    code = body.charCodeAt(position);
  }

  // Numbers cannot be followed by . or NameStart
  if (code === 0x002e || isNameStart(code)) {
    throw syntaxError(
      lexer.source,
      position,
      `Invalid number, expected digit but got: ${printCodePointAt(
        lexer,
        position,
      )}.`,
    );
  }

  return createToken(
    lexer,
    isFloat ? TokenKind.FLOAT : TokenKind.INT,
    start,
    position,
    body.slice(start, position),
  );
}

/**
 * Returns the new position in the source after reading one or more digits.
 *
 * @param firstCode - The code unit at `start`, already read by the caller.
 * @throws The error produced by `syntaxError` when `firstCode` is not a digit.
 */
function readDigits(lexer, start, firstCode) {
  if (!isDigit(firstCode)) {
    throw syntaxError(
      lexer.source,
      start,
      `Invalid number, expected digit but got: ${printCodePointAt(
        lexer,
        start,
      )}.`,
    );
  }

  const body = lexer.source.body;
  let position = start + 1; // +1 to skip first firstCode

  while (isDigit(body.charCodeAt(position))) {
    ++position;
  }

  return position;
}

/**
 * Reads a single-line string token (delimited by one `"` on each side, as
 * opposed to a `"""` block string) from the source file.
 *
 * ```
 * StringValue ::
 *   - `""` [lookahead != `"`]
 *   - `"` StringCharacter+ `"`
 *
 * StringCharacter ::
 *   - SourceCharacter but not `"` or `\` or LineTerminator
 *   - `\u` EscapedUnicode
 *   - `\` EscapedCharacter
 *
 * EscapedUnicode ::
 *   - `{` HexDigit+ `}`
 *   - HexDigit HexDigit HexDigit HexDigit
 *
 * EscapedCharacter :: one of `"` `\` `/` `b` `f` `n` `r` `t`
 * ```
 *
 * @returns A STRING token whose value has all escape sequences decoded.
 * @throws The error produced by `syntaxError` for an invalid character, an
 *   invalid escape sequence, or a string that is not closed before a line
 *   terminator or the end of the body.
 */
function readString(lexer, start) {
  const body = lexer.source.body;
  const bodyLength = body.length;
  let position = start + 1;
  // Start of the run of literal characters not yet appended to `value`.
  let chunkStart = position;
  let value = '';

  while (position < bodyLength) {
    const code = body.charCodeAt(position);

    // Closing Quote (")
    if (code === 0x0022) {
      value += body.slice(chunkStart, position);
      return createToken(lexer, TokenKind.STRING, start, position + 1, value);
    }

    // Escape Sequence (\)
    if (code === 0x005c) {
      value += body.slice(chunkStart, position);
      const escape =
        body.charCodeAt(position + 1) === 0x0075 // u
          ? body.charCodeAt(position + 2) === 0x007b // {
            ? readEscapedUnicodeVariableWidth(lexer, position)
            : readEscapedUnicodeFixedWidth(lexer, position)
          : readEscapedCharacter(lexer, position);
      value += escape.value;
      position += escape.size;
      chunkStart = position;
      continue;
    }

    // LineTerminator (\n | \r)
    if (code === 0x000a || code === 0x000d) {
      break;
    }

    // SourceCharacter
    if (isUnicodeScalarValue(code)) {
      ++position;
    } else if (isSupplementaryCodePoint(body, position)) {
      position += 2;
    } else {
      throw syntaxError(
        lexer.source,
        position,
        `Invalid character within String: ${printCodePointAt(
          lexer,
          position,
        )}.`,
      );
    }
  }

  throw syntaxError(lexer.source, position, 'Unterminated string.');
}

/**
 * Reads a variable width Unicode escape sequence (`\u{...}`) beginning at the
 * backslash located at `position`.
 *
 * @returns The string value and lexed size of the escape sequence, as
 *   `{ value, size }`.
 * @throws The error produced by `syntaxError` when the sequence is empty, too
 *   long, contains a non-hex character, or does not encode a Unicode scalar
 *   value.
 */
function readEscapedUnicodeVariableWidth(lexer, position) {
  const body = lexer.source.body;
  let point = 0;
  // Skip the three characters of the `\u{` prefix.
  let size = 3;

  // Cannot be larger than 12 chars (\u{00000000}).
  while (size < 12) {
    const code = body.charCodeAt(position + size++);

    // Closing Brace (})
    if (code === 0x007d) {
      // Must be at least 5 chars (\u{0}) and encode a Unicode scalar value.
      if (size < 5 || !isUnicodeScalarValue(point)) {
        break;
      }

      return {
        value: String.fromCodePoint(point),
        size,
      };
    }

    // Append this hex digit to the code point.
    point = (point << 4) | readHexDigit(code);

    // readHexDigit() returns -1 on error, which makes the ORed result negative.
    if (point < 0) {
      break;
    }
  }

  throw syntaxError(
    lexer.source,
    position,
    `Invalid Unicode escape sequence: "${body.slice(
      position,
      position + size,
    )}".`,
  );
}

/**
 * Reads a fixed width Unicode escape sequence (`\uXXXX`) beginning at the
 * backslash located at `position`, including a following `\uXXXX` when the two
 * together form a surrogate pair.
 *
 * @returns The string value and lexed size of the escape sequence, as
 *   `{ value, size }` where size is 6, or 12 for a surrogate pair.
 * @throws The error produced by `syntaxError` when the digits are not valid
 *   hexadecimal or encode an unpaired surrogate.
 */
function readEscapedUnicodeFixedWidth(lexer, position) {
  const body = lexer.source.body;
  const code = read16BitHexCode(body, position + 2);

  if (isUnicodeScalarValue(code)) {
    return {
      value: String.fromCodePoint(code),
      size: 6,
    };
  }

  // GraphQL allows JSON-style surrogate pair escape sequences, but only when
  // a valid pair is formed.
  if (isLeadingSurrogate(code)) {
    // \u
    if (
      body.charCodeAt(position + 6) === 0x005c &&
      body.charCodeAt(position + 7) === 0x0075
    ) {
      const trailingCode = read16BitHexCode(body, position + 8);

      if (isTrailingSurrogate(trailingCode)) {
        // JavaScript defines strings as a sequence of UTF-16 code units and
        // encodes Unicode code points above U+FFFF using a surrogate pair of
        // code units. Since this is a surrogate pair escape sequence, just
        // include both codes into the JavaScript string value. Had JavaScript
        // not been internally based on UTF-16, then this surrogate pair would
        // be decoded to retrieve the supplementary code point.
        return {
          value: String.fromCodePoint(code, trailingCode),
          size: 12,
        };
      }
    }
  }

  throw syntaxError(
    lexer.source,
    position,
    `Invalid Unicode escape sequence: "${body.slice(position, position + 6)}".`,
  );
}

/**
 * Reads four hexadecimal characters and returns the positive integer that 16bit
 * hexadecimal string represents. For example, "000f" will return 15, and "dead"
 * will return 57005.
 *
 * Returns a negative number if any char was not a valid hexadecimal digit.
 */
function read16BitHexCode(body, position) {
  // readHexDigit() returns -1 on error. ORing a negative value with any other
  // value always produces a negative value.
  return (
    (readHexDigit(body.charCodeAt(position)) << 12) |
    (readHexDigit(body.charCodeAt(position + 1)) << 8) |
    (readHexDigit(body.charCodeAt(position + 2)) << 4) |
    readHexDigit(body.charCodeAt(position + 3))
  );
}

/**
 * Reads a hexadecimal character and returns its positive integer value (0-15).
 *
 * '0' becomes 0, '9' becomes 9
 * 'A' becomes 10, 'F' becomes 15
 * 'a' becomes 10, 'f' becomes 15
 *
 * Returns -1 if the provided character code was not a valid hexadecimal digit.
 *
 * HexDigit :: one of
 *   - `0` `1` `2` `3` `4` `5` `6` `7` `8` `9`
 *   - `A` `B` `C` `D` `E` `F`
 *   - `a` `b` `c` `d` `e` `f`
 */
function readHexDigit(code) {
  return code >= 0x0030 && code <= 0x0039 // 0-9
    ? code - 0x0030
    : code >= 0x0041 && code <= 0x0046 // A-F
    ? code - 0x0037
    : code >= 0x0061 && code <= 0x0066 // a-f
    ? code - 0x0057
    : -1;
}

/**
 * Reads a two character escape sequence beginning at the backslash located at
 * `position` and returns `{ value, size }` for it.
 *
 * | Escaped Character | Code Point | Character Name               |
 * | ----------------- | ---------- | ---------------------------- |
 * | `"`               | U+0022     | double quote                 |
 * | `\`               | U+005C     | reverse solidus (back slash) |
 * | `/`               | U+002F     | solidus (forward slash)      |
 * | `b`               | U+0008     | backspace                    |
 * | `f`               | U+000C     | form feed                    |
 * | `n`               | U+000A     | line feed (new line)         |
 * | `r`               | U+000D     | carriage return              |
 * | `t`               | U+0009     | horizontal tab               |
 *
 * @throws The error produced by `syntaxError` when the character after the
 *   backslash is not one of the escaped characters listed above.
 */
function readEscapedCharacter(lexer, position) {
  const body = lexer.source.body;
  const code = body.charCodeAt(position + 1);

  switch (code) {
    case 0x0022: // "
      return {
        value: '\u0022',
        size: 2,
      };

    case 0x005c: // \
      return {
        value: '\u005c',
        size: 2,
      };

    case 0x002f: // /
      return {
        value: '\u002f',
        size: 2,
      };

    case 0x0062: // b
      return {
        value: '\u0008',
        size: 2,
      };

    case 0x0066: // f
      return {
        value: '\u000c',
        size: 2,
      };

    case 0x006e: // n
      return {
        value: '\u000a',
        size: 2,
      };

    case 0x0072: // r
      return {
        value: '\u000d',
        size: 2,
      };

    case 0x0074: // t
      return {
        value: '\u0009',
        size: 2,
      };
  }

  throw syntaxError(
    lexer.source,
    position,
    `Invalid character escape sequence: "${body.slice(
      position,
      position + 2,
    )}".`,
  );
}

/**
 * Reads a block string token from the source file.
 *
 * ```
 * StringValue ::
 *   - `"""` BlockStringCharacter* `"""`
 *
 * BlockStringCharacter ::
 *   - SourceCharacter but not `"""` or `\"""`
 *   - `\"""`
 * ```
 *
 * @returns A BLOCK_STRING token whose value is the dedented lines joined with
 *   U+000A. The token's line and column describe where the block string
 *   starts; `lexer.line` and `lexer.lineStart` are moved past the lines the
 *   block string spans only after the token has been created.
 * @throws The error produced by `syntaxError` for an invalid character or a
 *   block string that is not closed before the end of the body.
 */
function readBlockString(lexer, start) {
  const body = lexer.source.body;
  const bodyLength = body.length;
  // Tracked locally so the lexer state is untouched until the token exists.
  let lineStart = lexer.lineStart;
  let position = start + 3;
  let chunkStart = position;
  let currentLine = '';
  const blockLines = [];

  while (position < bodyLength) {
    const code = body.charCodeAt(position);

    // Closing Triple-Quote (""")
    if (
      code === 0x0022 &&
      body.charCodeAt(position + 1) === 0x0022 &&
      body.charCodeAt(position + 2) === 0x0022
    ) {
      currentLine += body.slice(chunkStart, position);
      blockLines.push(currentLine);
      const token = createToken(
        lexer,
        TokenKind.BLOCK_STRING,
        start,
        position + 3,
        // Return a string of the lines joined with U+000A.
        dedentBlockStringLines(blockLines).join('\n'),
      );
      lexer.line += blockLines.length - 1;
      lexer.lineStart = lineStart;
      return token;
    }

    // Escaped Triple-Quote (\""")
    if (
      code === 0x005c &&
      body.charCodeAt(position + 1) === 0x0022 &&
      body.charCodeAt(position + 2) === 0x0022 &&
      body.charCodeAt(position + 3) === 0x0022
    ) {
      currentLine += body.slice(chunkStart, position);
      chunkStart = position + 1; // skip only slash

      position += 4;
      continue;
    }

    // LineTerminator
    if (code === 0x000a || code === 0x000d) {
      currentLine += body.slice(chunkStart, position);
      blockLines.push(currentLine);

      // \r\n counts as a single line terminator.
      if (code === 0x000d && body.charCodeAt(position + 1) === 0x000a) {
        position += 2;
      } else {
        ++position;
      }

      currentLine = '';
      chunkStart = position;
      lineStart = position;
      continue;
    }

    // SourceCharacter
    if (isUnicodeScalarValue(code)) {
      ++position;
    } else if (isSupplementaryCodePoint(body, position)) {
      position += 2;
    } else {
      throw syntaxError(
        lexer.source,
        position,
        `Invalid character within String: ${printCodePointAt(
          lexer,
          position,
        )}.`,
      );
    }
  }

  throw syntaxError(lexer.source, position, 'Unterminated string.');
}

/**
 * Reads an alphanumeric + underscore name from the source.
 *
 * ```
 * Name ::
 *   - NameStart NameContinue* [lookahead != NameContinue]
 * ```
 *
 * The caller has already checked that the character at `start` is a
 * NameStart, so scanning begins at the following character.
 */
function readName(lexer, start) {
  const body = lexer.source.body;
  const bodyLength = body.length;
  let position = start + 1;

  while (position < bodyLength) {
    const code = body.charCodeAt(position);

    if (isNameContinue(code)) {
      ++position;
    } else {
      break;
    }
  }

  return createToken(
    lexer,
    TokenKind.NAME,
    start,
    position,
    body.slice(start, position),
  );
}