From 98904fbf661d23d476e9709b5a74fef0d96e4949 Mon Sep 17 00:00:00 2001 From: Knut Sveidqvist Date: Sat, 9 Aug 2025 15:46:30 +0200 Subject: [PATCH] WIP 3 --- .../diagrams/flowchart/parser/flow.grammar | 8 +- .../diagrams/flowchart/parser/flowParser.ts | 439 ++++++++++++++++-- 2 files changed, 402 insertions(+), 45 deletions(-) diff --git a/packages/mermaid/src/diagrams/flowchart/parser/flow.grammar b/packages/mermaid/src/diagrams/flowchart/parser/flow.grammar index 0204093c0..71a0dfa0c 100644 --- a/packages/mermaid/src/diagrams/flowchart/parser/flow.grammar +++ b/packages/mermaid/src/diagrams/flowchart/parser/flow.grammar @@ -120,11 +120,17 @@ InvTrapEnd { invTrapEnd } "<--" $[-]* $[-xo>] | // < + 2+ dashes + ending "--" $[-]* $[-xo>] | // 2+ dashes + ending (includes --> and ---) - // Edge text start patterns - for patterns like A<-- text -->B + // Edge text start patterns - for patterns like A<-- text -->B and A x== text ==x B // These need to be separate from complete arrows to handle edge text properly "<--" | // Left-pointing edge text start (matches START_LINK) "<==" | // Left-pointing thick edge text start "<-." | // Left-pointing dotted edge text start (matches START_DOTTED_LINK) + "x--" | // Cross head open normal start (A x-- text --x B) + "o--" | // Circle head open normal start (A o-- text --o B) + "x==" | // Cross head open thick start (A x== text ==x B) + "o==" | // Circle head open thick start (A o== text ==o B) + "x-." | // Cross head open dotted start (A x-. text .-x B) + "o-." | // Circle head open dotted start (A o-. text .-o B) // Thick arrows - JISON: [xo<]?\=\=+[=xo>] // Optional left head + 2+ equals + right ending diff --git a/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts b/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts index 81e7443ee..c5b65b095 100644 --- a/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts +++ b/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts @@ -100,6 +100,7 @@ class LezerFlowParser { edgeType: string; edgeStroke: string; } | null = null; // Track last edge for retroactive target chaining + private originalSource = ''; constructor() { this.yy = undefined; @@ -122,6 +123,9 @@ class LezerFlowParser { log.debug('UIO Parsing flowchart with Lezer:', newSrc); + // Keep a copy of the original source for substring extraction + this.originalSource = newSrc; + // Parse with Lezer const tree = lezerParser.parse(newSrc); @@ -169,6 +173,15 @@ class LezerFlowParser { const processedTokens: { type: string; value: string; from: number; to: number }[] = []; let i = 0; + // Helper: detect head-open tokens like x--, o--, x==, o==, x-., o-. + const isHeadOpenToken = (val: string) => + val === 'x--' || + val === 'o--' || + val === 'x==' || + val === 'o==' || + val === 'x-.' || + val === 'o-.'; + while (i < tokens.length) { const token = tokens[i]; @@ -179,6 +192,31 @@ class LezerFlowParser { continue; } + // Convert NODE_STRING head-open tokens (x--, o--, x==, o==, x-., o-.) into LINK when used as arrow openers + if (token.type === 'NODE_STRING' && isHeadOpenToken(token.value)) { + // Require a plausible source node immediately before in the processed stream + const prev = processedTokens[processedTokens.length - 1]; + // Look ahead for a closing LINK that ends with matching head (x/o) + const head = token.value[0]; // 'x' or 'o' + let hasClosingTail = false; + for (let j = i + 1; j < Math.min(tokens.length, i + 6); j++) { + const t = tokens[j]; + if (t.type === 'LINK' && (t.value.endsWith(head) || t.value.endsWith('>'))) { + hasClosingTail = true; + break; + } + } + if (prev && (prev.type === 'Identifier' || prev.type === 'NODE_STRING') && hasClosingTail) { + const converted = { ...token, type: 'LINK' }; + console.log( + `UIO DEBUG: Converted head-open token ${token.value} to LINK for double-ended arrow` + ); + processedTokens.push(converted); + i++; + continue; + } + } + // Try to detect fragmented edge patterns const mergedPattern = this.tryMergeFragmentedEdgePattern(tokens, i); if (mergedPattern) { @@ -213,25 +251,8 @@ class LezerFlowParser { // 2. A-- text including URL space and send -->B // 3. A---|text|B (pipe-delimited) - // Check for simple edge pattern first (A---B, A--xB, etc.) - // But only if it's not part of a pipe-delimited pattern - if ( - this.isSimpleEdgePattern(tokens[startIndex]) && - !this.isPartOfPipeDelimitedPattern(tokens, startIndex) - ) { - const patternTokens = [tokens[startIndex]]; - console.log( - `UIO DEBUG: Analyzing simple edge pattern: ${patternTokens.map((t) => t.value).join(' ')}` - ); - - const merged = this.detectAndMergeEdgePattern(patternTokens, tokens, startIndex); - if (merged) { - return { - mergedTokens: merged, - nextIndex: startIndex + 1, - }; - } - } + // Defer simple one-token edge merging until after checking for pipe or text-between-arrows + // This ensures patterns like A--text ... -->B are handled as text, not as A -- text. // Check for pipe-delimited pattern (A---|text|B) if (this.isPipeDelimitedEdgePattern(tokens, startIndex)) { @@ -282,6 +303,20 @@ class LezerFlowParser { return null; // Not a complex edge pattern } + // Special handling: if this looks like A--text ... -->B or A-- text ... -->B, + // fall back to Pattern1/Pattern2 detection so we retain the text. + // This helps edge text without pipes. + { + const slice = tokens.slice(startIndex, endIndex); + const merged = this.detectAndMergeEdgePattern(slice, tokens, startIndex); + if (merged) { + return { + mergedTokens: merged, + nextIndex: endIndex, + } as any; // Will be handled by caller above + } + } + // Extract the tokens that form this edge pattern const patternTokens = tokens.slice(startIndex, endIndex); console.log( @@ -1038,18 +1073,34 @@ class LezerFlowParser { case 'RectStart': case 'TrapStart': case 'InvTrapStart': - case 'TagEnd': // Odd shape start ('>text]') - // Handle orphaned shape tokens (shape tokens without preceding node ID) - // Check if we have a pending shaped target ID from an embedded arrow edge + case 'TagEnd': // Odd shape start ('>text]') or split-arrow head ('>') + // Priority 1: If we have a pending shaped target from an embedded arrow, consume as shaped node now if (this.pendingShapedTargetId) { console.log( `UIO DEBUG: Applying shape to pending target node: ${this.pendingShapedTargetId}` ); i = this.parseShapedNodeForTarget(tokens, i, this.pendingShapedTargetId); this.pendingShapedTargetId = null; // Clear the pending target - } else { - i = this.parseStatement(tokens, i); + break; } + + // Priority 2: Orphaned shape token for the last referenced node (e.g., A-->B>text]) + if (this.isShapeStart(token) && this.lastReferencedNodeId) { + console.log( + `UIO DEBUG: Detected orphaned shape token '${token.type}:${token.value}' for lastReferencedNodeId=${this.lastReferencedNodeId}` + ); + i = this.parseOrphanedShapeStatement(tokens, i); + break; + } + + // Priority 3: Continuation edge head (e.g., A-->B-->C) + if (token.type === 'TagEnd' && token.value === '>' && this.lastTargetNodes.length > 0) { + i = this.parseContinuationEdgeStatement(tokens, i); + break; + } + + // Fallback: Delegate to parseStatement + i = this.parseStatement(tokens, i); break; case 'CLICK': i = this.parseClickStatement(tokens, i); @@ -1185,6 +1236,19 @@ class LezerFlowParser { lookahead.map((t) => `${t.type}:${t.value}`) ); + // Accessibility statements: accTitle / accDescr + if ( + lookahead.length >= 1 && + lookahead[0].type === 'NODE_STRING' && + (lookahead[0].value === 'accTitle' || lookahead[0].value === 'accDescr') + ) { + if (lookahead[0].value === 'accTitle') { + return this.parseAccTitleStatement(tokens, i); + } else { + return this.parseAccDescrStatement(tokens, i); + } + } + // Check if this is a direction statement (direction BT) if ( lookahead.length >= 2 && @@ -1281,7 +1345,7 @@ class LezerFlowParser { // Check if this is an edge (A --> B pattern or A(text) --> B pattern) // Check for orphaned shape tokens (shape tokens without preceding node ID) FIRST // This happens when an edge creates a target node but leaves the shape tokens for later processing - if (lookahead.length >= 3 && this.isShapeStart(lookahead[0].type)) { + if (lookahead.length >= 3 && this.isShapeStart(lookahead[0])) { console.log(`UIO DEBUG: Taking orphaned shape statement path (shape without node ID)`); return this.parseOrphanedShapeStatement(tokens, i); } @@ -1633,11 +1697,14 @@ class LezerFlowParser { } /** - * Check if a token type represents a shape start delimiter - * @param tokenType - The token type to check - * @returns True if it's a shape start delimiter + * Check if a token represents a shape start delimiter + * Accepts either a token object or a token type string for backward compatibility */ - private isShapeStart(tokenType: string): boolean { + private isShapeStart(tokenOrType: { type: string; value: string } | string): boolean { + const type = typeof tokenOrType === 'string' ? tokenOrType : tokenOrType.type; + const val = typeof tokenOrType === 'string' ? '' : tokenOrType.value; + + // Base shape starts by token type const shapeStarts = [ 'SquareStart', // [ 'ParenStart', // ( @@ -1650,7 +1717,18 @@ class LezerFlowParser { 'InvTrapStart', // [\ 'TagEnd', // > (for odd shapes) ]; - return shapeStarts.includes(tokenType); + + if (shapeStarts.includes(type)) { + return true; + } + + // Some punctuation comes through as generic '⚠' tokens in the lexer + // Treat '⚠' with value '>' as an odd-shape start + if (type === '⚠' && val === '>') { + return true; + } + + return false; } /** @@ -1775,21 +1853,103 @@ class LezerFlowParser { } } + // Track string parsing state inside shape text + let inString = false; + let stringQuote: '"' | "'" | null = null; + let seenStr = false; // saw a single quoted string token as entire text + const sawEllipseCloseHyphen = false; // for ellipse (-text-) + // Collect all tokens until we find any valid shape end delimiter while (i < tokens.length && !possibleEndTokens.includes(tokens[i].type)) { + const tk = tokens[i]; + + // If we get a complete quoted string token (STR), allow it only if it's the only content + if (tk.type === 'STR') { + if (shapeText.trim().length > 0 || seenStr) { + throw new Error("got 'STR'"); + } + shapeText += tk.value; // keep quotes; processNodeText will strip and classify + seenStr = true; + i++; + continue; + } + // For ellipse shapes, stop when we encounter the closing hyphen - if (actualShapeType === 'EllipseStart' && tokens[i].type === 'Hyphen') { + if (actualShapeType === 'EllipseStart' && tk.type === 'Hyphen') { break; // This is the closing hyphen, don't include it in the text } + // If a full STR was consumed as the only text, parentheses should trigger SQE (legacy) + if ( + seenStr && + (tk.type === 'ParenStart' || tk.type === 'ParenEnd' || tk.value === '(' || tk.value === ')') + ) { + throw new Error("Expecting 'SQE'"); + } + + // Quote handling - mirror legacy JISON error behavior + const isQuoteToken = + tk.type === 'STR' || + tk.type === 'SQS' || + tk.type === 'SQE' || + tk.type === 'DQS' || + tk.type === 'DQE' || + (tk.type === '⚠' && (tk.value === '"' || tk.value === "'")); + + if (isQuoteToken) { + const quoteChar: '"' | "'" = tk.value === "'" ? "'" : '"'; + + if (!inString) { + // If there is already plain text before a quote, error: mixing text and string + if (shapeText.trim().length > 0) { + throw new Error("got 'STR'"); + } + // Enter string mode; do not include quote char itself in text + inString = true; + stringQuote = quoteChar; + i++; + continue; + } else { + // Already inside a string + if (stringQuote === quoteChar) { + // Closing the string + inString = false; + stringQuote = null; + i++; + continue; + } else { + // Nested/mismatched quote inside string + throw new Error("Expecting 'SQE'"); + } + } + } + + // If inside a string, any parentheses should trigger the SQE error (unterminated string expected) + if ( + inString && + (tk.type === 'ParenStart' || tk.type === 'ParenEnd' || tk.value === '(' || tk.value === ')') + ) { + throw new Error("Expecting 'SQE'"); + } + + // In square/rect shapes, parentheses are not allowed within text (legacy behavior) + if ((actualShapeType === 'SquareStart' || actualShapeType === 'RectStart') && !inString) { + if (tk.type === 'ParenStart' || tk.value === '(') { + throw new Error("got 'PS'"); + } + if (tk.type === 'ParenEnd' || tk.value === ')') { + throw new Error("got 'PE'"); + } + } + // Note: We don't stop for statement keywords when inside shape delimiters // Keywords like 'linkStyle', 'classDef', etc. should be treated as regular text // when they appear inside shapes like [linkStyle] or (classDef) // Check for HTML tag pattern: < + tag_name + > if ( - tokens[i].type === '⚠' && - tokens[i].value === '<' && + tk.type === '⚠' && + tk.value === '<' && i + 2 < tokens.length && !possibleEndTokens.includes(tokens[i + 1].type) ) { @@ -1803,7 +1963,7 @@ class LezerFlowParser { // Preserve original spacing before HTML tag if (shapeText && i > startIndex + 1) { const prevToken = tokens[i - 1]; - const currentToken = tokens[i]; + const currentToken = tk; const gap = currentToken.from - prevToken.to; if (gap > 0) { @@ -1826,13 +1986,13 @@ class LezerFlowParser { // Preserve original spacing by checking token position gaps if (shapeText && i > startIndex + 1) { const prevToken = tokens[i - 1]; - const currentToken = tokens[i]; + const currentToken = tk; const gap = currentToken.from - prevToken.to; if (gap > 0) { // Preserve original spacing (gap represents number of spaces) shapeText += ' '.repeat(gap); - } else if (this.shouldAddSpaceBetweenTokens(shapeText, tokens[i].value, tokens[i].type)) { + } else if (this.shouldAddSpaceBetweenTokens(shapeText, tk.value, tk.type)) { // Fall back to smart spacing if no gap shapeText += ' '; } @@ -1840,16 +2000,16 @@ class LezerFlowParser { // Special handling for ellipse shapes: if this is the last token and it ends with '-', // strip the trailing hyphen as it's part of the shape syntax (-text-) - let tokenValue = tokens[i].value; + let tokenValue = tk.value; if ( actualShapeType === 'EllipseStart' && - tokens[i].type === 'NODE_STRING' && + tk.type === 'NODE_STRING' && tokenValue.endsWith('-') && (i + 1 >= tokens.length || possibleEndTokens.includes(tokens[i + 1].type)) ) { tokenValue = tokenValue.slice(0, -1); // Remove trailing hyphen console.log( - `UIO DEBUG: Stripped trailing hyphen from ellipse text: "${tokens[i].value}" -> "${tokenValue}"` + `UIO DEBUG: Stripped trailing hyphen from ellipse text: "${tk.value}" -> "${tokenValue}"` ); } @@ -1857,6 +2017,11 @@ class LezerFlowParser { i++; } + // If we are still in a string when the shape ends or input ends, error + if (inString) { + throw new Error("Expecting 'SQE'"); + } + // Special handling for ellipse end: need to skip the final hyphen if ( actualShapeType === 'EllipseStart' && // Skip the final hyphen before the closing parenthesis @@ -1866,14 +2031,16 @@ class LezerFlowParser { i++; } - // Capture the actual end token for shape mapping - let actualEndToken = ''; - if (i < tokens.length) { - actualEndToken = tokens[i].type; + // If we ran out of tokens before encountering the shape end, throw to avoid hanging + if (i >= tokens.length) { + throw new Error('Unexpected end of input'); } + // Capture the actual end token for shape mapping + const actualEndToken = tokens[i].type; + // Skip the shape end delimiter - if (i < tokens.length && tokens[i].type === shapeEndType) { + if (tokens[i].type === shapeEndType) { i++; } @@ -4023,6 +4190,8 @@ class LezerFlowParser { /^<=+$/, // <==, <===, etc. /^[ox]-+$/, // o--, x--, etc. /^-+[ox]$/, // --o, --x, etc. + /^[ox]=+$/, // o==, x==, etc. (thick open with head) + /^=+[ox]$/, // ==o, ==x, etc. (thick close with head) /^<-\.$/, // <-. /^\.->$/, // .-> /^=+$/, // open thick continuation (==, ===) @@ -4913,6 +5082,188 @@ class LezerFlowParser { return i; } + + /** + * Parse accTitle: single-line accessibility title + */ + private parseAccTitleStatement( + tokens: { type: string; value: string; from: number; to: number }[], + startIndex: number + ): number { + let i = startIndex; + // Consume 'accTitle' + i++; + // Optional ':' which may come as a generic token (⚠) with value ':' + if (i < tokens.length && tokens[i].value.trim() === ':') { + i++; + } + + // Collect text until semicolon or statement boundary/newline gap + let title = ''; + while (i < tokens.length) { + const t = tokens[i]; + if (t.type === 'SEMI') { + i++; + break; + } + // Stop on obvious statement starters/structural tokens + if ( + ['GRAPH', 'SUBGRAPH', 'STYLE', 'CLASSDEF', 'CLASS', 'LINKSTYLE', 'CLICK'].includes( + t.type + ) || + t.type === 'AMP' || + t.type === 'LINK' || + t.type === 'Arrow' + ) { + break; + } + // Stop if large gap (newline) and we already collected some text + if (title.length > 0 && i > startIndex + 1) { + const prev = tokens[i - 1]; + const gap = t.from - prev.to; + if (gap > 5) { + break; + } + } + + // Append with spacing rules + if (title.length === 0) { + title = t.value; + } else { + if (this.shouldAddSpaceBetweenTokens(title, t.value, t.type)) { + title += ' ' + t.value; + } else { + title += t.value; + } + } + i++; + } + + title = title.trim(); + if (this.yy && typeof (this.yy as any).setAccTitle === 'function') { + (this.yy as any).setAccTitle(title); + } + + return i; + } + + /** + * Parse accDescr: single-line or block form with braces + */ + private parseAccDescrStatement( + tokens: { type: string; value: string; from: number; to: number }[], + startIndex: number + ): number { + let i = startIndex; + // Consume 'accDescr' + i++; + + // Optional ':' which may come as a generic token (⚠) with value ':' + if (i < tokens.length && tokens[i].value.trim() === ':') { + i++; + } + + // Block form if next token is DiamondStart ("{") + if (i < tokens.length && tokens[i].type === 'DiamondStart') { + const blockStart = tokens[i]; // '{' + i++; + // Find matching DiamondEnd ("}") + let j = i; + let blockEndIndex = -1; + while (j < tokens.length) { + if (tokens[j].type === 'DiamondEnd') { + blockEndIndex = j; + break; + } + j++; + } + if (blockEndIndex === -1) { + // No closing brace; fall back to single-line accumulation + return this.parseAccDescrSingleLine(tokens, i); + } + + // Extract substring from original source preserving newlines, trim indentation and empty lines + const startPos = blockStart.to; // position right after '{' + const endPos = tokens[blockEndIndex].from; // position right before '}' + let raw = ''; + try { + raw = this.originalSource.slice(startPos, endPos); + } catch (e) { + // Fallback to token concat if something goes wrong + return this.parseAccDescrSingleLine(tokens, i); + } + + const lines = raw + .split(/\r?\n/) + .map((ln) => ln.trim()) + .filter((ln) => ln.length > 0); + const descr = lines.join('\n'); + + if (this.yy && typeof (this.yy as any).setAccDescription === 'function') { + (this.yy as any).setAccDescription(descr); + } + + // Move index past the closing brace + return blockEndIndex + 1; + } + + // Otherwise, treat as single-line form + return this.parseAccDescrSingleLine(tokens, i); + } + + private parseAccDescrSingleLine( + tokens: { type: string; value: string; from: number; to: number }[], + startIndex: number + ): number { + let i = startIndex; + let descr = ''; + + while (i < tokens.length) { + const t = tokens[i]; + if (t.type === 'SEMI') { + i++; + break; + } + // Stop at obvious statement boundaries + if ( + ['GRAPH', 'SUBGRAPH', 'STYLE', 'CLASSDEF', 'CLASS', 'LINKSTYLE', 'CLICK'].includes( + t.type + ) || + t.type === 'AMP' || + t.type === 'LINK' || + t.type === 'Arrow' + ) { + break; + } + + // Stop if large gap (newline) and we already collected some text + if (descr.length > 0) { + const prev = tokens[i - 1]; + const gap = t.from - prev.to; + if (gap > 5) { + break; + } + } + + if (descr.length === 0) { + descr = t.value; + } else { + if (this.shouldAddSpaceBetweenTokens(descr, t.value, t.type)) { + descr += ' ' + t.value; + } else { + descr += t.value; + } + } + i++; + } + + descr = descr.trim(); + if (this.yy && typeof (this.yy as any).setAccDescription === 'function') { + (this.yy as any).setAccDescription(descr); + } + + return i; + } } // Create parser instance