Knut Sveidqvist
2025-08-09 15:46:30 +02:00
parent a07cdd8b11
commit 98904fbf66
2 changed files with 402 additions and 45 deletions

View File

@@ -120,11 +120,17 @@ InvTrapEnd { invTrapEnd }
"<--" $[-]* $[-xo>] | // < + 2+ dashes + ending
"--" $[-]* $[-xo>] | // 2+ dashes + ending (includes --> and ---)
// Edge text start patterns - for patterns like A<-- text -->B
// Edge text start patterns - for patterns like A<-- text -->B and A x== text ==x B
// These need to be separate from complete arrows to handle edge text properly
"<--" | // Left-pointing edge text start (matches START_LINK)
"<==" | // Left-pointing thick edge text start
"<-." | // Left-pointing dotted edge text start (matches START_DOTTED_LINK)
"x--" | // Cross head open normal start (A x-- text --x B)
"o--" | // Circle head open normal start (A o-- text --o B)
"x==" | // Cross head open thick start (A x== text ==x B)
"o==" | // Circle head open thick start (A o== text ==o B)
"x-." | // Cross head open dotted start (A x-. text .-x B)
"o-." | // Circle head open dotted start (A o-. text .-o B)
// Thick arrows - JISON: [xo<]?\=\=+[=xo>]
// Optional left head + 2+ equals + right ending
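A hedged sketch of the flowchart inputs the new head-open alternatives are meant to admit; the diagram strings mirror the examples in the comments above, and the constant name is illustrative only:
// Double-ended edges carrying inline text, one per stroke style (illustrative):
const headOpenEdgeTextExamples: string[] = [
  'flowchart LR\n  A x-- label --x B', // cross heads, normal stroke
  'flowchart LR\n  A o-- label --o B', // circle heads, normal stroke
  'flowchart LR\n  A x== label ==x B', // cross heads, thick stroke
  'flowchart LR\n  A o-. label .-o B', // circle heads, dotted stroke
];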

View File

@@ -100,6 +100,7 @@ class LezerFlowParser {
edgeType: string;
edgeStroke: string;
} | null = null; // Track last edge for retroactive target chaining
private originalSource = '';
constructor() {
this.yy = undefined;
@@ -122,6 +123,9 @@ class LezerFlowParser {
log.debug('UIO Parsing flowchart with Lezer:', newSrc);
// Keep a copy of the original source for substring extraction
this.originalSource = newSrc;
// Parse with Lezer
const tree = lezerParser.parse(newSrc);
@@ -169,6 +173,15 @@ class LezerFlowParser {
const processedTokens: { type: string; value: string; from: number; to: number }[] = [];
let i = 0;
// Helper: detect head-open tokens like x--, o--, x==, o==, x-., o-.
const isHeadOpenToken = (val: string) =>
val === 'x--' ||
val === 'o--' ||
val === 'x==' ||
val === 'o==' ||
val === 'x-.' ||
val === 'o-.';
while (i < tokens.length) {
const token = tokens[i];
@@ -179,6 +192,31 @@ class LezerFlowParser {
continue;
}
// Convert NODE_STRING head-open tokens (x--, o--, x==, o==, x-., o-.) into LINK when used as arrow openers
if (token.type === 'NODE_STRING' && isHeadOpenToken(token.value)) {
// Require a plausible source node immediately before in the processed stream
const prev = processedTokens[processedTokens.length - 1];
// Look ahead for a closing LINK that ends with matching head (x/o)
const head = token.value[0]; // 'x' or 'o'
let hasClosingTail = false;
for (let j = i + 1; j < Math.min(tokens.length, i + 6); j++) {
const t = tokens[j];
if (t.type === 'LINK' && (t.value.endsWith(head) || t.value.endsWith('>'))) {
hasClosingTail = true;
break;
}
}
if (prev && (prev.type === 'Identifier' || prev.type === 'NODE_STRING') && hasClosingTail) {
const converted = { ...token, type: 'LINK' };
console.log(
`UIO DEBUG: Converted head-open token ${token.value} to LINK for double-ended arrow`
);
processedTokens.push(converted);
i++;
continue;
}
}
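// Hedged illustration of the rewrite above for 'A x-- label --x B' (token shapes assumed):
//   before: Identifier:'A', NODE_STRING:'x--', NODE_STRING:'label', LINK:'--x', Identifier:'B'
//   after:  Identifier:'A', LINK:'x--',        NODE_STRING:'label', LINK:'--x', Identifier:'B'
// Re-typing the opener lets the edge-text merging below pair it with the closing '--x'.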
// Try to detect fragmented edge patterns
const mergedPattern = this.tryMergeFragmentedEdgePattern(tokens, i);
if (mergedPattern) {
@@ -213,25 +251,8 @@ class LezerFlowParser {
// 2. A-- text including URL space and send -->B
// 3. A---|text|B (pipe-delimited)
// Check for simple edge pattern first (A---B, A--xB, etc.)
// But only if it's not part of a pipe-delimited pattern
if (
this.isSimpleEdgePattern(tokens[startIndex]) &&
!this.isPartOfPipeDelimitedPattern(tokens, startIndex)
) {
const patternTokens = [tokens[startIndex]];
console.log(
`UIO DEBUG: Analyzing simple edge pattern: ${patternTokens.map((t) => t.value).join(' ')}`
);
const merged = this.detectAndMergeEdgePattern(patternTokens, tokens, startIndex);
if (merged) {
return {
mergedTokens: merged,
nextIndex: startIndex + 1,
};
}
}
// Defer simple one-token edge merging until after checking for pipe-delimited and text-between-arrow forms
// This ensures patterns like A--text ... -->B are treated as an edge with text, not as edge A-- followed by a 'text' node
// Check for pipe-delimited pattern (A---|text|B)
if (this.isPipeDelimitedEdgePattern(tokens, startIndex)) {
@@ -282,6 +303,20 @@ class LezerFlowParser {
return null; // Not a complex edge pattern
}
// Special handling: if this looks like A--text ... -->B or A-- text ... -->B,
// fall back to Pattern1/Pattern2 detection so we retain the text.
// This covers edge text that is not pipe-delimited.
{
const slice = tokens.slice(startIndex, endIndex);
const merged = this.detectAndMergeEdgePattern(slice, tokens, startIndex);
if (merged) {
return {
mergedTokens: merged,
nextIndex: endIndex,
} as any; // Will be handled by caller above
}
}
// Extract the tokens that form this edge pattern
const patternTokens = tokens.slice(startIndex, endIndex);
console.log(
@@ -1038,18 +1073,34 @@ class LezerFlowParser {
case 'RectStart':
case 'TrapStart':
case 'InvTrapStart':
case 'TagEnd': // Odd shape start ('>text]')
// Handle orphaned shape tokens (shape tokens without preceding node ID)
// Check if we have a pending shaped target ID from an embedded arrow edge
case 'TagEnd': // Odd shape start ('>text]') or split-arrow head ('>')
// Priority 1: If we have a pending shaped target from an embedded arrow, consume as shaped node now
if (this.pendingShapedTargetId) {
console.log(
`UIO DEBUG: Applying shape to pending target node: ${this.pendingShapedTargetId}`
);
i = this.parseShapedNodeForTarget(tokens, i, this.pendingShapedTargetId);
this.pendingShapedTargetId = null; // Clear the pending target
} else {
i = this.parseStatement(tokens, i);
break;
}
// Priority 2: Orphaned shape token for the last referenced node (e.g., A-->B>text])
if (this.isShapeStart(token) && this.lastReferencedNodeId) {
console.log(
`UIO DEBUG: Detected orphaned shape token '${token.type}:${token.value}' for lastReferencedNodeId=${this.lastReferencedNodeId}`
);
i = this.parseOrphanedShapeStatement(tokens, i);
break;
}
// Priority 3: Continuation edge head (e.g., A-->B-->C)
if (token.type === 'TagEnd' && token.value === '>' && this.lastTargetNodes.length > 0) {
i = this.parseContinuationEdgeStatement(tokens, i);
break;
}
// Fallback: Delegate to parseStatement
i = this.parseStatement(tokens, i);
break;
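// Hedged input sketches for the three priorities above (drawn from the comments; exact tokenization assumed):
//   Priority 1: an edge whose target carries a shape, e.g. A-->B(label), where B was recorded as a pending shaped target
//   Priority 2: A-->B>text]  — the trailing '>text]' is an orphaned odd-shape applied to the last referenced node B
//   Priority 3: A-->B-->C    — a bare '>' head that continues the chain from the last target nodes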
case 'CLICK':
i = this.parseClickStatement(tokens, i);
@@ -1185,6 +1236,19 @@ class LezerFlowParser {
lookahead.map((t) => `${t.type}:${t.value}`)
);
// Accessibility statements: accTitle / accDescr
if (
lookahead.length >= 1 &&
lookahead[0].type === 'NODE_STRING' &&
(lookahead[0].value === 'accTitle' || lookahead[0].value === 'accDescr')
) {
if (lookahead[0].value === 'accTitle') {
return this.parseAccTitleStatement(tokens, i);
} else {
return this.parseAccDescrStatement(tokens, i);
}
}
// Check if this is a direction statement (direction BT)
if (
lookahead.length >= 2 &&
@@ -1281,7 +1345,7 @@ class LezerFlowParser {
// Check if this is an edge (A --> B pattern or A(text) --> B pattern)
// Check for orphaned shape tokens (shape tokens without preceding node ID) FIRST
// This happens when an edge creates a target node but leaves the shape tokens for later processing
if (lookahead.length >= 3 && this.isShapeStart(lookahead[0].type)) {
if (lookahead.length >= 3 && this.isShapeStart(lookahead[0])) {
console.log(`UIO DEBUG: Taking orphaned shape statement path (shape without node ID)`);
return this.parseOrphanedShapeStatement(tokens, i);
}
@@ -1633,11 +1697,14 @@ class LezerFlowParser {
}
/**
* Check if a token type represents a shape start delimiter
* @param tokenType - The token type to check
* @returns True if it's a shape start delimiter
* Check if a token represents a shape start delimiter
* Accepts either a token object or a token type string for backward compatibility
*/
private isShapeStart(tokenType: string): boolean {
private isShapeStart(tokenOrType: { type: string; value: string } | string): boolean {
const type = typeof tokenOrType === 'string' ? tokenOrType : tokenOrType.type;
const val = typeof tokenOrType === 'string' ? '' : tokenOrType.value;
// Base shape starts by token type
const shapeStarts = [
'SquareStart', // [
'ParenStart', // (
@@ -1650,7 +1717,18 @@ class LezerFlowParser {
'InvTrapStart', // [\
'TagEnd', // > (for odd shapes)
];
return shapeStarts.includes(tokenType);
if (shapeStarts.includes(type)) {
return true;
}
// Some punctuation comes through as generic '⚠' tokens in the lexer
// Treat '⚠' with value '>' as an odd-shape start
if (type === '⚠' && val === '>') {
return true;
}
return false;
}
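// Hedged usage sketch: both call forms are accepted —
//   this.isShapeStart('SquareStart')              -> true
//   this.isShapeStart({ type: '⚠', value: '>' })  -> true (generic token carrying '>')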
/**
@@ -1775,21 +1853,103 @@ class LezerFlowParser {
}
}
// Track string parsing state inside shape text
let inString = false;
let stringQuote: '"' | "'" | null = null;
let seenStr = false; // saw a single quoted string token as entire text
const sawEllipseCloseHyphen = false; // for ellipse (-text-)
// Collect all tokens until we find any valid shape end delimiter
while (i < tokens.length && !possibleEndTokens.includes(tokens[i].type)) {
const tk = tokens[i];
// If we get a complete quoted string token (STR), allow it only if it's the only content
if (tk.type === 'STR') {
if (shapeText.trim().length > 0 || seenStr) {
throw new Error("got 'STR'");
}
shapeText += tk.value; // keep quotes; processNodeText will strip and classify
seenStr = true;
i++;
continue;
}
// For ellipse shapes, stop when we encounter the closing hyphen
if (actualShapeType === 'EllipseStart' && tokens[i].type === 'Hyphen') {
if (actualShapeType === 'EllipseStart' && tk.type === 'Hyphen') {
break; // This is the closing hyphen, don't include it in the text
}
// If a full STR was consumed as the only text, parentheses should trigger SQE (legacy)
if (
seenStr &&
(tk.type === 'ParenStart' || tk.type === 'ParenEnd' || tk.value === '(' || tk.value === ')')
) {
throw new Error("Expecting 'SQE'");
}
// Quote handling - mirror legacy JISON error behavior
const isQuoteToken =
tk.type === 'STR' ||
tk.type === 'SQS' ||
tk.type === 'SQE' ||
tk.type === 'DQS' ||
tk.type === 'DQE' ||
(tk.type === '⚠' && (tk.value === '"' || tk.value === "'"));
if (isQuoteToken) {
const quoteChar: '"' | "'" = tk.value === "'" ? "'" : '"';
if (!inString) {
// If there is already plain text before a quote, error: mixing text and string
if (shapeText.trim().length > 0) {
throw new Error("got 'STR'");
}
// Enter string mode; do not include quote char itself in text
inString = true;
stringQuote = quoteChar;
i++;
continue;
} else {
// Already inside a string
if (stringQuote === quoteChar) {
// Closing the string
inString = false;
stringQuote = null;
i++;
continue;
} else {
// Nested/mismatched quote inside string
throw new Error("Expecting 'SQE'");
}
}
}
// If inside a string, any parentheses should trigger the SQE error (unterminated string expected)
if (
inString &&
(tk.type === 'ParenStart' || tk.type === 'ParenEnd' || tk.value === '(' || tk.value === ')')
) {
throw new Error("Expecting 'SQE'");
}
// In square/rect shapes, parentheses are not allowed within text (legacy behavior)
if ((actualShapeType === 'SquareStart' || actualShapeType === 'RectStart') && !inString) {
if (tk.type === 'ParenStart' || tk.value === '(') {
throw new Error("got 'PS'");
}
if (tk.type === 'ParenEnd' || tk.value === ')') {
throw new Error("got 'PE'");
}
}
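// Hedged examples of square-shape text reaching the branches above (legacy-compatible behavior assumed):
//   A["label (with parens)"]  -> the quoted STR is the whole label; parentheses are fine inside the quotes
//   A[label (with parens)]    -> throws "got 'PS'": unquoted parentheses are rejected in square/rect text
//   A["never closed]          -> throws "Expecting 'SQE'" once the shape ends while still inside the string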
// Note: We don't stop for statement keywords when inside shape delimiters
// Keywords like 'linkStyle', 'classDef', etc. should be treated as regular text
// when they appear inside shapes like [linkStyle] or (classDef)
// Check for HTML tag pattern: < + tag_name + >
if (
tokens[i].type === '⚠' &&
tokens[i].value === '<' &&
tk.type === '⚠' &&
tk.value === '<' &&
i + 2 < tokens.length &&
!possibleEndTokens.includes(tokens[i + 1].type)
) {
@@ -1803,7 +1963,7 @@ class LezerFlowParser {
// Preserve original spacing before HTML tag
if (shapeText && i > startIndex + 1) {
const prevToken = tokens[i - 1];
const currentToken = tokens[i];
const currentToken = tk;
const gap = currentToken.from - prevToken.to;
if (gap > 0) {
@@ -1826,13 +1986,13 @@ class LezerFlowParser {
// Preserve original spacing by checking token position gaps
if (shapeText && i > startIndex + 1) {
const prevToken = tokens[i - 1];
const currentToken = tokens[i];
const currentToken = tk;
const gap = currentToken.from - prevToken.to;
if (gap > 0) {
// Preserve original spacing (gap represents number of spaces)
shapeText += ' '.repeat(gap);
} else if (this.shouldAddSpaceBetweenTokens(shapeText, tokens[i].value, tokens[i].type)) {
} else if (this.shouldAddSpaceBetweenTokens(shapeText, tk.value, tk.type)) {
// Fall back to smart spacing if no gap
shapeText += ' ';
}
@@ -1840,16 +2000,16 @@ class LezerFlowParser {
// Special handling for ellipse shapes: if this is the last token and it ends with '-',
// strip the trailing hyphen as it's part of the shape syntax (-text-)
let tokenValue = tokens[i].value;
let tokenValue = tk.value;
if (
actualShapeType === 'EllipseStart' &&
tokens[i].type === 'NODE_STRING' &&
tk.type === 'NODE_STRING' &&
tokenValue.endsWith('-') &&
(i + 1 >= tokens.length || possibleEndTokens.includes(tokens[i + 1].type))
) {
tokenValue = tokenValue.slice(0, -1); // Remove trailing hyphen
console.log(
`UIO DEBUG: Stripped trailing hyphen from ellipse text: "${tokens[i].value}" -> "${tokenValue}"`
`UIO DEBUG: Stripped trailing hyphen from ellipse text: "${tk.value}" -> "${tokenValue}"`
);
}
@@ -1857,6 +2017,11 @@ class LezerFlowParser {
i++;
}
// If we are still in a string when the shape ends or input ends, error
if (inString) {
throw new Error("Expecting 'SQE'");
}
// Special handling for ellipse end: need to skip the final hyphen
if (
actualShapeType === 'EllipseStart' && // Skip the final hyphen before the closing parenthesis
@@ -1866,14 +2031,16 @@ class LezerFlowParser {
i++;
}
// Capture the actual end token for shape mapping
let actualEndToken = '';
if (i < tokens.length) {
actualEndToken = tokens[i].type;
// If we ran out of tokens before encountering the shape end, throw to avoid hanging
if (i >= tokens.length) {
throw new Error('Unexpected end of input');
}
// Capture the actual end token for shape mapping
const actualEndToken = tokens[i].type;
// Skip the shape end delimiter
if (i < tokens.length && tokens[i].type === shapeEndType) {
if (tokens[i].type === shapeEndType) {
i++;
}
@@ -4023,6 +4190,8 @@ class LezerFlowParser {
/^<=+$/, // <==, <===, etc.
/^[ox]-+$/, // o--, x--, etc.
/^-+[ox]$/, // --o, --x, etc.
/^[ox]=+$/, // o==, x==, etc. (thick open with head)
/^=+[ox]$/, // ==o, ==x, etc. (thick close with head)
/^<-\.$/, // <-.
/^\.->$/, // .->
/^=+$/, // open thick continuation (==, ===)
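A quick hedged sanity check of the two added patterns (a standalone sketch, not wired into the parser):
// Thick edge fragments that open or close with a cross/circle head.
const thickOpenWithHead = /^[ox]=+$/; // o==, x===, ...
const thickCloseWithHead = /^=+[ox]$/; // ==o, ===x, ...
console.log(thickOpenWithHead.test('x=='), thickCloseWithHead.test('==o')); // true true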
@@ -4913,6 +5082,188 @@ class LezerFlowParser {
return i;
}
/**
* Parse accTitle: single-line accessibility title
*/
private parseAccTitleStatement(
tokens: { type: string; value: string; from: number; to: number }[],
startIndex: number
): number {
let i = startIndex;
// Consume 'accTitle'
i++;
// Optional ':' which may come as a generic token (⚠) with value ':'
if (i < tokens.length && tokens[i].value.trim() === ':') {
i++;
}
// Collect text until semicolon or statement boundary/newline gap
let title = '';
while (i < tokens.length) {
const t = tokens[i];
if (t.type === 'SEMI') {
i++;
break;
}
// Stop on obvious statement starters/structural tokens
if (
['GRAPH', 'SUBGRAPH', 'STYLE', 'CLASSDEF', 'CLASS', 'LINKSTYLE', 'CLICK'].includes(
t.type
) ||
t.type === 'AMP' ||
t.type === 'LINK' ||
t.type === 'Arrow'
) {
break;
}
// Stop if large gap (newline) and we already collected some text
if (title.length > 0 && i > startIndex + 1) {
const prev = tokens[i - 1];
const gap = t.from - prev.to;
if (gap > 5) {
break;
}
}
// Append with spacing rules
if (title.length === 0) {
title = t.value;
} else {
if (this.shouldAddSpaceBetweenTokens(title, t.value, t.type)) {
title += ' ' + t.value;
} else {
title += t.value;
}
}
i++;
}
title = title.trim();
if (this.yy && typeof (this.yy as any).setAccTitle === 'function') {
(this.yy as any).setAccTitle(title);
}
return i;
}
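// Hedged input sketch for the single-line form handled above (accTitle syntax as documented for Mermaid):
//   flowchart TD
//     accTitle: Accessible title for the diagram
//     A --> B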
/**
* Parse accDescr: single-line or block form with braces
*/
private parseAccDescrStatement(
tokens: { type: string; value: string; from: number; to: number }[],
startIndex: number
): number {
let i = startIndex;
// Consume 'accDescr'
i++;
// Optional ':' which may come as a generic token (⚠) with value ':'
if (i < tokens.length && tokens[i].value.trim() === ':') {
i++;
}
// Block form if next token is DiamondStart ("{")
if (i < tokens.length && tokens[i].type === 'DiamondStart') {
const blockStart = tokens[i]; // '{'
i++;
// Find matching DiamondEnd ("}")
let j = i;
let blockEndIndex = -1;
while (j < tokens.length) {
if (tokens[j].type === 'DiamondEnd') {
blockEndIndex = j;
break;
}
j++;
}
if (blockEndIndex === -1) {
// No closing brace; fall back to single-line accumulation
return this.parseAccDescrSingleLine(tokens, i);
}
// Extract substring from original source preserving newlines, trim indentation and empty lines
const startPos = blockStart.to; // position right after '{'
const endPos = tokens[blockEndIndex].from; // position right before '}'
let raw = '';
try {
raw = this.originalSource.slice(startPos, endPos);
} catch (e) {
// Fallback to token concat if something goes wrong
return this.parseAccDescrSingleLine(tokens, i);
}
const lines = raw
.split(/\r?\n/)
.map((ln) => ln.trim())
.filter((ln) => ln.length > 0);
const descr = lines.join('\n');
if (this.yy && typeof (this.yy as any).setAccDescription === 'function') {
(this.yy as any).setAccDescription(descr);
}
// Move index past the closing brace
return blockEndIndex + 1;
}
// Otherwise, treat as single-line form
return this.parseAccDescrSingleLine(tokens, i);
}
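// Hedged input sketches for the two accDescr forms handled above:
//   accDescr: One-line description of the diagram
//   accDescr {
//     A multi-line description; the braces arrive as DiamondStart/DiamondEnd tokens
//     and the text between them is sliced from originalSource to preserve line breaks.
//   }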
private parseAccDescrSingleLine(
tokens: { type: string; value: string; from: number; to: number }[],
startIndex: number
): number {
let i = startIndex;
let descr = '';
while (i < tokens.length) {
const t = tokens[i];
if (t.type === 'SEMI') {
i++;
break;
}
// Stop at obvious statement boundaries
if (
['GRAPH', 'SUBGRAPH', 'STYLE', 'CLASSDEF', 'CLASS', 'LINKSTYLE', 'CLICK'].includes(
t.type
) ||
t.type === 'AMP' ||
t.type === 'LINK' ||
t.type === 'Arrow'
) {
break;
}
// Stop if large gap (newline) and we already collected some text
if (descr.length > 0) {
const prev = tokens[i - 1];
const gap = t.from - prev.to;
if (gap > 5) {
break;
}
}
if (descr.length === 0) {
descr = t.value;
} else {
if (this.shouldAddSpaceBetweenTokens(descr, t.value, t.type)) {
descr += ' ' + t.value;
} else {
descr += t.value;
}
}
i++;
}
descr = descr.trim();
if (this.yy && typeof (this.yy as any).setAccDescription === 'function') {
(this.yy as any).setAccDescription(descr);
}
return i;
}
}
// Create parser instance